lsdf committed on
Commit
86b2387
·
1 Parent(s): d5937ae

Align n-gram selection rules across summary and optimizer.

Browse files

Apply stage candidate rules for bi/tri-grams and keyword-derived unigrams, add n-gram target rotation to avoid single-term stalls, and sync summary n-gram logic with optimizer tolerance policy.

Made-with: Cursor

docs/TEXT_OPTIMIZER_PRINCIPLES.md CHANGED
@@ -41,6 +41,23 @@ Update it whenever optimization policy changes.
41
  - if `avg >= 4`: acceptable range is `avg +/- 20%`
42
  - if `avg < 4`: acceptable range is `avg +/- 50%`
43
  - N-gram signal is counted only when term is outside tolerance and present in enough competitors.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  ## 6) Local acceptance and batch accumulation
46
 
 
41
  - if `avg >= 4`: acceptable range is `avg +/- 20%`
42
  - if `avg < 4`: acceptable range is `avg +/- 50%`
43
  - N-gram signal is counted only when term is outside tolerance and present in enough competitors.
44
+ - Selection rules (multi-competitor mode, `competitors > 1`):
45
+ - bi-grams and tri-grams are eligible when present in `>= 2` competitors;
46
+ - unigrams are eligible only if they are part of user keyword phrases and present in `>= 2` competitors.
47
+ - Iteration behavior:
48
+ - optimizer may work on one n-gram at a time per step,
49
+ - if no primary progress on current n-gram target, it rotates to the next eligible n-gram candidate.
50
+
51
+ ## 5.1 Summary logic memory (current)
52
+
53
+ - Summary recommendation triggers:
54
+ - BERT warning when phrase score `< 0.70`;
55
+ - BM25 warning when `REMOVE >= 4`;
56
+ - N-gram warning when term is underrepresented among competitors;
57
+ - Title warning when Title BERT `< 0.65`;
58
+ - Semantic warning when keyword terms are weaker than competitor average.
59
+ - For N-grams in summary:
60
+ - summary renders top rows for readability, but optimizer runs against the full eligible candidate set.
61
 
62
  ## 6) Local acceptance and batch accumulation
63
 
optimizer.py CHANGED
@@ -254,6 +254,38 @@ def _ngram_deviation_ratio(target_count: float, competitor_avg: float) -> float:
254
  return abs(target_count - competitor_avg) / max(competitor_avg, 1e-6)
255
 
256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  def _chunk_ngram_count(text: str, ngram_label: str, language: str) -> int:
258
  toks = _filter_stopwords(_tokenize(text), language)
259
  phrase_toks = _filter_stopwords(_tokenize(ngram_label), language)
@@ -293,11 +325,15 @@ def _compute_metrics(
293
 
294
  ngram_signal_count = 0
295
  ngram_gap_sum = 0.0
 
296
  ngrams = analysis.get("ngram_stats", {}) or {}
297
- for bucket_name in ("unigrams", "bigrams"):
298
- for item in (ngrams.get(bucket_name) or []):
 
 
299
  comp_occ = int(item.get("comp_occurrence", 0))
300
- if comp_occ < min_signal:
 
301
  continue
302
  target = float(item.get("target_count", 0))
303
  comp_avg = float(item.get("competitor_avg", 0))
@@ -393,6 +429,7 @@ def _choose_optimization_goal(
393
  language: str,
394
  stage: str = "bert",
395
  bert_stage_target: float = BERT_TARGET_THRESHOLD,
 
396
  ) -> Dict[str, Any]:
397
  candidates: Dict[str, Dict[str, Any]] = {}
398
  bert_details = analysis.get("bert_analysis", {}).get("detailed", []) or []
@@ -434,6 +471,8 @@ def _choose_optimization_goal(
434
  # N-gram balancing (toward competitor average with tolerance policy).
435
  ngram_rows: List[Tuple[str, float, float, float, int, float]] = []
436
  ngram_stats = analysis.get("ngram_stats", {}) or {}
 
 
437
  for bucket_name, bucket in ngram_stats.items():
438
  if not isinstance(bucket, list):
439
  continue
@@ -444,7 +483,7 @@ def _choose_optimization_goal(
444
  target = float(item.get("target_count", 0))
445
  comp_avg = float(item.get("competitor_avg", 0))
446
  comp_occ = int(item.get("comp_occurrence", 0))
447
- if comp_occ < 2:
448
  continue
449
  if not _is_ngram_outside_tolerance(target, comp_avg):
450
  continue
@@ -453,7 +492,8 @@ def _choose_optimization_goal(
453
  ngram_rows.append((ngram_label, target, comp_avg, tol, comp_occ, dev_ratio))
454
  if ngram_rows:
455
  ngram_rows.sort(key=lambda x: (x[5], x[4], x[2]), reverse=True)
456
- label, target, comp_avg, tol, _, _ = ngram_rows[0]
 
457
  candidates["ngram"] = {
458
  "type": "ngram",
459
  "label": label,
@@ -465,6 +505,8 @@ def _choose_optimization_goal(
465
  "ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
466
  "ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
467
  "ngram_direction": "increase" if target < comp_avg else "decrease",
 
 
468
  }
469
 
470
  title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
@@ -1204,6 +1246,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1204
  queued_candidates: List[Dict[str, Any]] = []
1205
  stage_idx = 0
1206
  stage_no_progress_steps = 0
 
1207
 
1208
  for step in range(max_iterations):
1209
  while stage_idx < len(STAGE_ORDER) and _is_stage_complete(
@@ -1223,6 +1266,7 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1223
  language,
1224
  stage=active_stage,
1225
  bert_stage_target=bert_stage_target,
 
1226
  )
1227
  if goal["type"] == "none":
1228
  stage_idx += 1
@@ -1545,10 +1589,13 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1545
  current_analysis = best_local["analysis"]
1546
  current_semantic = best_local["semantic"]
1547
  current_metrics = best_local["metrics"]
1548
- if _stage_primary_progress(active_stage, prev_metrics, current_metrics):
 
1549
  stage_no_progress_steps = 0
1550
  else:
1551
  stage_no_progress_steps += 1
 
 
1552
  applied_changes += 1
1553
  queued_candidates = []
1554
 
@@ -1687,10 +1734,13 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1687
  current_analysis = best_batch["batch_analysis"]
1688
  current_semantic = best_batch["batch_semantic"]
1689
  current_metrics = best_batch["batch_metrics"]
1690
- if _stage_primary_progress(active_stage, prev_metrics, current_metrics):
 
1691
  stage_no_progress_steps = 0
1692
  else:
1693
  stage_no_progress_steps += 1
 
 
1694
  applied_changes += 1
1695
  batch_applied = True
1696
  batch_info = {
@@ -1809,6 +1859,8 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1809
  }
1810
  )
1811
  stage_no_progress_steps += 1
 
 
1812
  if stage_no_progress_steps >= 3 and stage_idx < len(STAGE_ORDER) - 1:
1813
  stage_idx += 1
1814
  stage_no_progress_steps = 0
@@ -1838,10 +1890,13 @@ def optimize_text(request_data: Dict[str, Any]) -> Dict[str, Any]:
1838
  current_analysis = best["analysis"]
1839
  current_semantic = best["semantic"]
1840
  current_metrics = best["metrics"]
1841
- if _stage_primary_progress(active_stage, prev_metrics, current_metrics):
 
1842
  stage_no_progress_steps = 0
1843
  else:
1844
  stage_no_progress_steps += 1
 
 
1845
  applied_changes += 1
1846
  queued_candidates = []
1847
 
 
254
  return abs(target_count - competitor_avg) / max(competitor_avg, 1e-6)
255
 
256
 
257
def _keyword_unigram_set(keywords: List[str], language: str) -> set:
    """Collect the stopword-filtered tokens appearing in user keyword phrases.

    Used by the n-gram candidate rules: in multi-competitor mode a unigram is
    only eligible when it is part of a user keyword phrase.

    Args:
        keywords: Raw user keyword phrases (each phrase may contain several words).
        language: Language code forwarded to the stopword filter.

    Returns:
        Set of individual tokens (as produced by ``_tokenize``) drawn from all
        keyword phrases, with stopwords removed. Duplicates collapse naturally.
    """
    # Set comprehension replaces the original manual loop-and-add idiom.
    return {
        token
        for kw in keywords
        for token in _filter_stopwords(_tokenize(kw), language)
    }
264
+
265
+
266
def _is_ngram_stage_candidate(
    ngram_label: str,
    comp_occurrence: int,
    competitor_count: int,
    keyword_unigrams: set,
) -> bool:
    """Decide whether an n-gram may be targeted by the optimization stage.

    Multi-competitor mode (``competitor_count > 1``): the term must occur in
    at least two competitors; bi-/tri-grams then always qualify, while
    unigrams qualify only when they come from a user keyword phrase.
    Single-competitor mode keeps broader eligibility: any term seen at least
    once qualifies.
    """
    normalized = (ngram_label or "").strip().lower()
    if not normalized:
        return False
    if competitor_count <= 1:
        # Single-competitor mode: keep broader eligibility.
        return comp_occurrence >= 1
    if comp_occurrence < 2:
        return False
    parts = _tokenize(normalized)
    if len(parts) >= 2:
        # Bi-/tri-grams present in >= 2 competitors are always candidates.
        return True
    # Unigrams are candidates only if they belong to keyword phrases.
    return len(parts) == 1 and parts[0] in keyword_unigrams
287
+
288
+
289
  def _chunk_ngram_count(text: str, ngram_label: str, language: str) -> int:
290
  toks = _filter_stopwords(_tokenize(text), language)
291
  phrase_toks = _filter_stopwords(_tokenize(ngram_label), language)
 
325
 
326
  ngram_signal_count = 0
327
  ngram_gap_sum = 0.0
328
+ keyword_unigrams = _keyword_unigram_set(keywords, language)
329
  ngrams = analysis.get("ngram_stats", {}) or {}
330
+ for bucket in ngrams.values():
331
+ if not isinstance(bucket, list):
332
+ continue
333
+ for item in bucket:
334
  comp_occ = int(item.get("comp_occurrence", 0))
335
+ ngram_label = str(item.get("ngram", ""))
336
+ if not _is_ngram_stage_candidate(ngram_label, comp_occ, competitor_count, keyword_unigrams):
337
  continue
338
  target = float(item.get("target_count", 0))
339
  comp_avg = float(item.get("competitor_avg", 0))
 
429
  language: str,
430
  stage: str = "bert",
431
  bert_stage_target: float = BERT_TARGET_THRESHOLD,
432
+ stage_cursor: int = 0,
433
  ) -> Dict[str, Any]:
434
  candidates: Dict[str, Dict[str, Any]] = {}
435
  bert_details = analysis.get("bert_analysis", {}).get("detailed", []) or []
 
471
  # N-gram balancing (toward competitor average with tolerance policy).
472
  ngram_rows: List[Tuple[str, float, float, float, int, float]] = []
473
  ngram_stats = analysis.get("ngram_stats", {}) or {}
474
+ competitor_count = len((analysis.get("word_counts", {}) or {}).get("competitors", []) or [])
475
+ keyword_unigrams = _keyword_unigram_set(keywords, language)
476
  for bucket_name, bucket in ngram_stats.items():
477
  if not isinstance(bucket, list):
478
  continue
 
483
  target = float(item.get("target_count", 0))
484
  comp_avg = float(item.get("competitor_avg", 0))
485
  comp_occ = int(item.get("comp_occurrence", 0))
486
+ if not _is_ngram_stage_candidate(ngram_label, comp_occ, competitor_count, keyword_unigrams):
487
  continue
488
  if not _is_ngram_outside_tolerance(target, comp_avg):
489
  continue
 
492
  ngram_rows.append((ngram_label, target, comp_avg, tol, comp_occ, dev_ratio))
493
  if ngram_rows:
494
  ngram_rows.sort(key=lambda x: (x[5], x[4], x[2]), reverse=True)
495
+ pick = max(0, int(stage_cursor)) % len(ngram_rows)
496
+ label, target, comp_avg, tol, _, _ = ngram_rows[pick]
497
  candidates["ngram"] = {
498
  "type": "ngram",
499
  "label": label,
 
505
  "ngram_lower_bound": round(comp_avg * (1.0 - tol), 3),
506
  "ngram_upper_bound": round(comp_avg * (1.0 + tol), 3),
507
  "ngram_direction": "increase" if target < comp_avg else "decrease",
508
+ "ngram_rank_index": pick,
509
+ "ngram_candidates_total": len(ngram_rows),
510
  }
511
 
512
  title_bert = analysis.get("title_analysis", {}).get("bert", {}) or {}
 
1246
  queued_candidates: List[Dict[str, Any]] = []
1247
  stage_idx = 0
1248
  stage_no_progress_steps = 0
1249
+ stage_goal_cursor: Dict[str, int] = {}
1250
 
1251
  for step in range(max_iterations):
1252
  while stage_idx < len(STAGE_ORDER) and _is_stage_complete(
 
1266
  language,
1267
  stage=active_stage,
1268
  bert_stage_target=bert_stage_target,
1269
+ stage_cursor=int(stage_goal_cursor.get(active_stage, 0)),
1270
  )
1271
  if goal["type"] == "none":
1272
  stage_idx += 1
 
1589
  current_analysis = best_local["analysis"]
1590
  current_semantic = best_local["semantic"]
1591
  current_metrics = best_local["metrics"]
1592
+ progressed_stage = _stage_primary_progress(active_stage, prev_metrics, current_metrics)
1593
+ if progressed_stage:
1594
  stage_no_progress_steps = 0
1595
  else:
1596
  stage_no_progress_steps += 1
1597
+ if active_stage == "ngram":
1598
+ stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1599
  applied_changes += 1
1600
  queued_candidates = []
1601
 
 
1734
  current_analysis = best_batch["batch_analysis"]
1735
  current_semantic = best_batch["batch_semantic"]
1736
  current_metrics = best_batch["batch_metrics"]
1737
+ progressed_stage = _stage_primary_progress(active_stage, prev_metrics, current_metrics)
1738
+ if progressed_stage:
1739
  stage_no_progress_steps = 0
1740
  else:
1741
  stage_no_progress_steps += 1
1742
+ if active_stage == "ngram":
1743
+ stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1744
  applied_changes += 1
1745
  batch_applied = True
1746
  batch_info = {
 
1859
  }
1860
  )
1861
  stage_no_progress_steps += 1
1862
+ if active_stage == "ngram":
1863
+ stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1864
  if stage_no_progress_steps >= 3 and stage_idx < len(STAGE_ORDER) - 1:
1865
  stage_idx += 1
1866
  stage_no_progress_steps = 0
 
1890
  current_analysis = best["analysis"]
1891
  current_semantic = best["semantic"]
1892
  current_metrics = best["metrics"]
1893
+ progressed_stage = _stage_primary_progress(active_stage, prev_metrics, current_metrics)
1894
+ if progressed_stage:
1895
  stage_no_progress_steps = 0
1896
  else:
1897
  stage_no_progress_steps += 1
1898
+ if active_stage == "ngram":
1899
+ stage_goal_cursor[active_stage] = int(stage_goal_cursor.get(active_stage, 0)) + 1
1900
  applied_changes += 1
1901
  queued_candidates = []
1902
 
templates/index.html CHANGED
@@ -1412,22 +1412,51 @@
1412
 
1413
  // === 3) N-grams: сигнал только если 2+ конкурента ===
1414
  const ngramSignals = [];
1415
- const ngramBuckets = analysisData.ngram_stats
1416
- ? [analysisData.ngram_stats.unigrams, analysisData.ngram_stats.bigrams]
1417
- : [];
1418
- ngramBuckets.forEach(bucket => {
1419
- (bucket || []).forEach(item => {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1420
  const compOcc = Number(item.comp_occurrence || 0);
1421
  const targetCount = Number(item.target_count || 0);
1422
  const compAvg = Number(item.competitor_avg || 0);
1423
- if (compOcc < minCompetitorSignal) return;
1424
- const ratioSignal = targetCount === 0 ? compAvg > 0 : compAvg >= targetCount * 2;
1425
- if (!ratioSignal) return;
1426
  ngramSignals.push({
1427
  ngram: item.ngram,
1428
  compOcc,
1429
  targetCount,
1430
- compAvg
 
1431
  });
1432
  });
1433
  });
@@ -1439,7 +1468,7 @@
1439
  if (ngramSignals.length > 0) {
1440
  const topSignals = ngramSignals
1441
  .slice()
1442
- .sort((a, b) => (b.compOcc - a.compOcc) || (b.compAvg - a.compAvg))
1443
  .slice(0, 10)
1444
  .map(x => ({
1445
  ngram: x.ngram,
@@ -1661,7 +1690,7 @@
1661
  container.innerHTML = `
1662
  <div class="stat-card">
1663
  <h5 class="card-title mb-3">Итоговые рекомендации (что сделать в первую очередь)</h5>
1664
- <p class="text-muted small mb-3">Сводка формируется по правилам: BERT &lt; 0.70, BM25 remove ≥ 4, n-граммы с сигналом от 2+ конкурентов, Title BERT &lt; 0.65, Semantic Core-разрыв по словам из ключей.</p>
1665
  ${recCards}
1666
  </div>
1667
  <div class="stat-card">
 
1412
 
1413
  // === 3) N-grams: сигнал только если 2+ конкурента ===
1414
  const ngramSignals = [];
1415
+ const ngramStats = analysisData.ngram_stats || {};
1416
+ const kwUnigrams = new Set();
1417
+ keywordsRaw.forEach(kw => {
1418
+ String(kw || '')
1419
+ .toLowerCase()
1420
+ .replace(/[^\p{L}\p{N}\s-]+/gu, ' ')
1421
+ .split(/\s+/)
1422
+ .map(v => v.trim())
1423
+ .filter(v => v.length >= 2)
1424
+ .forEach(t => kwUnigrams.add(t));
1425
+ });
1426
// True when targetCount falls outside the tolerance band around compAvg:
// +/-20% when the competitor average is >= 4, +/-50% otherwise.
// A non-positive average carries no signal, so it is never "outside".
const isOutsideTolerance = (targetCount, compAvg) => {
  if (compAvg <= 0) {
    return false;
  }
  const tolerance = compAvg >= 4 ? 0.20 : 0.50;
  const lowerBound = compAvg * (1 - tolerance);
  const upperBound = compAvg * (1 + tolerance);
  return targetCount < lowerBound || targetCount > upperBound;
};
1431
// Client-side mirror of the optimizer's n-gram candidate policy:
// with multiple competitors the term needs comp_occurrence >= 2; bi/tri-grams
// then always qualify, unigrams only when drawn from a user keyword phrase.
// Reads competitorCount and kwUnigrams from the enclosing scope.
const isEligibleNgram = (ngram, compOcc) => {
  const normalized = String(ngram || '')
    .toLowerCase()
    .replace(/[^\p{L}\p{N}\s-]+/gu, ' ');
  const tokens = normalized
    .split(/\s+/)
    .map(part => part.trim())
    .filter(part => part.length >= 2);
  if (tokens.length === 0) {
    return false;
  }
  if (competitorCount <= 1) {
    // Single-competitor mode keeps the broad rule.
    return compOcc >= 1;
  }
  if (compOcc < 2) {
    return false;
  }
  // bi/tri-grams always pass; unigram must come from a keyword phrase
  return tokens.length >= 2 || kwUnigrams.has(tokens[0]);
};
1446
+ Object.values(ngramStats).forEach(bucket => {
1447
+ (Array.isArray(bucket) ? bucket : []).forEach(item => {
1448
  const compOcc = Number(item.comp_occurrence || 0);
1449
  const targetCount = Number(item.target_count || 0);
1450
  const compAvg = Number(item.competitor_avg || 0);
1451
+ if (!isEligibleNgram(item.ngram, compOcc)) return;
1452
+ if (!isOutsideTolerance(targetCount, compAvg)) return;
1453
+ const devRatio = compAvg > 0 ? Math.abs(targetCount - compAvg) / compAvg : 0;
1454
  ngramSignals.push({
1455
  ngram: item.ngram,
1456
  compOcc,
1457
  targetCount,
1458
+ compAvg,
1459
+ devRatio
1460
  });
1461
  });
1462
  });
 
1468
  if (ngramSignals.length > 0) {
1469
  const topSignals = ngramSignals
1470
  .slice()
1471
+ .sort((a, b) => (b.devRatio - a.devRatio) || (b.compOcc - a.compOcc) || (b.compAvg - a.compAvg))
1472
  .slice(0, 10)
1473
  .map(x => ({
1474
  ngram: x.ngram,
 
1690
  container.innerHTML = `
1691
  <div class="stat-card">
1692
  <h5 class="card-title mb-3">Итоговые рекомендации (что сделать в первую очередь)</h5>
1693
+ <p class="text-muted small mb-3">Сводка формируется по правилам: BERT &lt; 0.70, BM25 remove ≥ 4, n-граммы по допускам (±20% при Avg≥4, ±50% при Avg&lt;4) с фильтром K>=2 для multi-competitor, Title BERT &lt; 0.65, Semantic Core-разрыв по словам из ключей.</p>
1694
  ${recCards}
1695
  </div>
1696
  <div class="stat-card">