Spaces:

dusan-presswhizz
/

PressWhizz-Link-Insert-Suggestion-API

Build error

App Files Files Community

dusan-presswhizz committed on Aug 26, 2025

Commit

ab9bead

verified ·

1 Parent(s): 0ee888e

Update app.py

Browse files

Files changed (1) hide show

app.py +221 -239

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os, re, json, requests, urllib.parse, hashlib, html
 from functools import lru_cache
-from typing import List, Optional
 # Torch / Transformers
 import torch, torch.nn.functional as F
@@ -38,9 +38,9 @@ UA = {
     )
 }
-# --- OpenAI settings ---
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")
 FALLBACK_OPENAI_MODEL  = "gpt-4o-mini"
 OPENAI_CHAT_URL        = "https://api.openai.com/v1/chat/completions"
@@ -222,6 +222,59 @@ def get_text_blocks(url: str, max_paragraphs: int = 8) -> List[str]:
         print(f"get_text_blocks fatal: {e}")
         return []
 # =========================
 # Embedding helpers
 # =========================
@@ -247,7 +300,6 @@ def embed(texts: List[str]):
 def inject_anchor_into_sentence(sentence, anchor_text, target_url):
     if not sentence or not anchor_text:
         return sentence, False
-    # prefer exact word-boundary replacement if present
     try:
         pattern = re.compile(r'\b' + re.escape(anchor_text) + r'\b', re.IGNORECASE)
         if pattern.search(sentence):
@@ -255,195 +307,141 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
             return result, True
     except Exception:
         pass
-    # else append a natural clause
     if len(sentence) > 0 and sentence[-1] in '.!?':
         base, punct = sentence[:-1], sentence[-1]
     else:
         base, punct = sentence, '.'
-    clause = f' with insights from <a href="{target_url}">{anchor_text}</a>'
-    rewritten = f'{base}{clause}{punct}'
     return rewritten, False
 # =========================
-# OpenAI helpers (cached)
 # =========================
-def _openai_chat_cached(cache_key: str, model_name: str, system: str, user_json: dict):
-    if cache_key in API_RESPONSE_CACHE:
-        print(f"[GPT] Using cached response for {cache_key[:8]}...")
-        return API_RESPONSE_CACHE[cache_key]
     if not OPENAI_API_KEY:
         raise RuntimeError("OPENAI_API_KEY not set")
     headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
     body = {
         "model": model_name,
-        "response_format": {"type": "json_object"},
         "messages": [
             {"role": "system", "content": system},
-            {"role": "user", "content": json.dumps(user_json)}
-        ],
-        "temperature": 0.6
     }
     r = requests.post(OPENAI_CHAT_URL, headers=headers, json=body, timeout=60)
     print(f"[GPT] Model={model_name} HTTP {r.status_code}")
     r.raise_for_status()
     txt = r.json()["choices"][0]["message"]["content"]
-    result = json.loads(txt)
-    API_RESPONSE_CACHE[cache_key] = result
-    return result
-def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", language="English"):
-    if not OPENAI_API_KEY:
-        print("[GPT] No OPENAI_API_KEY found → using fallback.")
-        return {"sentence_html": sentence_html}
-    cache_key = hashlib.md5(f"{sentence_html}{anchor_text}{target_url}{style}{language}".encode()).hexdigest()
-    system = (
-        f"You are a skilled content editor writing in {language}. "
-        "Integrate the given anchor naturally into ONE sentence of similar length. "
-        "STRICT: include an <a href> using the EXACT anchor text; no em dashes. "
-        f"Return JSON with key sentence_html."
-    )
-    user = {
-        "task": "rewrite_for_link_insertion",
-        "sentence_html": sentence_html,
-        "anchor_text": anchor_text,
-        "target_url": target_url,
-        "style": style,
-        "language": language,
-        "preserve_special_chars": True,
-        "constraints": {"max_extra_words": 20}
-    }
     try:
-        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
     except Exception as e:
         print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
-        try:
-            obj = _openai_chat_cached(cache_key + "_fallback", FALLBACK_OPENAI_MODEL, system, user)
-        except Exception as e2:
-            print(f"[GPT] Fallback failed: {e2}. Using original sentence.")
-            return {"sentence_html": sentence_html}
-    out = obj.get("sentence_html", sentence_html)
-    return {"sentence_html": out}
-def gpt_validate_and_polish(sentence_html, anchor_text, target_url, language="English"):
     if not OPENAI_API_KEY:
         return {"sentence_html": sentence_html}
-    cache_key = hashlib.md5(f"polish_{sentence_html}{anchor_text}{target_url}{language}".encode()).hexdigest()
     system = (
-        f"You are an advanced editor writing in {language}. "
-        "Input: a draft HTML sentence with an <a> link (anchor text fixed). "
-        "Polish if natural; else rewrite (max 5 sentences). Keep anchor EXACT, href unchanged; no em dashes. "
-        "Return JSON with key 'sentence_html'."
     )
     user = {
         "sentence_html": sentence_html,
         "anchor_text": anchor_text,
         "target_url": target_url,
-        "language": language,
-        "preserve_special_chars": True
     }
-    try:
-        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
-    except Exception:
-        try:
-            obj = _openai_chat_cached(cache_key + "_fallback", FALLBACK_OPENAI_MODEL, system, user)
-        except Exception:
-            return {"sentence_html": sentence_html}
-    out = obj.get("sentence_html", sentence_html)
-    soup = BeautifulSoup(out, "html.parser")
-    if not soup.find("a"):
-        return {"sentence_html": sentence_html}
-    return {"sentence_html": out}
-def gpt_get_search_keywords(target_content, target_url):
     if not OPENAI_API_KEY:
-        return ["related content", "learn more", "additional information"]
-    content_preview = " ".join(target_content[:5]) if isinstance(target_content, list) else str(target_content)[:3000]
-    cache_key = hashlib.md5(f"keywords_{target_url}_{content_preview[:500]}".encode()).hexdigest()
     system = (
-        "You are an SEO expert. Identify 5-10 realistic search keywords users would type to find this page. "
-        "Return JSON: {'keywords': [...]}"
     )
-    user = {
-        "task": "identify_search_keywords",
-        "page_content": content_preview,
-        "url": target_url,
-        "requirements": {"count": "5-10", "type": "practical"}
-    }
-    try:
-        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
-    except Exception as e:
-        print(f"[GPT] Keywords extraction failed: {e}")
-        return ["related content", "learn more", "additional information"]
-    return obj.get("keywords", ["related content"])
 def gpt_generate_content_with_keyword(source_blocks, keywords, target_url, language="English"):
     if not OPENAI_API_KEY or not keywords:
         return None
     source_preview = " ".join(source_blocks[:3])[:500]
-    cache_key = hashlib.md5(f"generate_{source_preview}_{str(keywords)}_{target_url}_{language}".encode()).hexdigest()
     system = (
         f"You are a skilled content writer in {language}. Given article paragraphs and keyword candidates "
-        "for a target link, do: 1) choose ONE best keyword; 2) write 1-3 sentences including it as an <a href>; "
-        "3) provide the exact source sentence AFTER WHICH to insert. "
         "Return JSON keys: chosen_keyword, new_content, insert_after_sentence."
     )
     user = {
         "article_paragraphs": source_blocks[:7],
         "available_keywords": keywords,
         "target_url": target_url,
-        "language": language,
-        "requirements": {"natural_flow": True, "include_link": True}
     }
-    try:
-        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
-        return obj
-    except Exception as e:
-        print(f"[GPT] Content generation failed: {e}")
-        try:
-            obj = _openai_chat_cached(cache_key + "_fallback", FALLBACK_OPENAI_MODEL, system, user)
-            return obj
-        except Exception:
-            return None
 def to_plain_text(html_or_text: str) -> str:
     text = BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
     return html.unescape(text)
 # =========================
-# Core logic (with ANCHOR-FIRST fix)
 # =========================
 def find_alternative_anchor(blocks, target_url, original_anchor):
     try:
-        print(f"[Alternative] Extracting target page content from {target_url}")
-        target_blocks = get_text_blocks(target_url, max_paragraphs=5)
-        if not target_blocks:
-            print("[Alternative] No content extracted from target page")
-            return None, None
-        keywords = gpt_get_search_keywords(target_blocks, target_url)
-        print(f"[Alternative] Keywords identified: {keywords}")
-        if not keywords or not isinstance(keywords, list):
             return None, None
         source_text = " ".join(blocks[:2])
-        detected_lang = detect_language(source_text)
-        language_name = get_language_name(detected_lang)
-        print(f"[Alternative] Detected language: {language_name}")
         result = gpt_generate_content_with_keyword(
             source_blocks=blocks,
@@ -454,7 +452,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
         if not result:
             return None, None
-        chosen_keyword = result.get("chosen_keyword", keywords[0] if keywords else original_anchor)
         new_content = result.get("new_content", "")
         insert_after_sentence = result.get("insert_after_sentence", "")
@@ -469,7 +467,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
         return chosen_keyword, f"{position_text}\n\n{new_content}" if position_text else new_content
     except Exception as e:
-        print(f"[Alternative] Critical error: {e}")
         return None, None
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
@@ -483,109 +481,83 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
         print("="*50)
         full_text = " ".join(blocks)
-        anchor_text_lower = anchor_text.lower() if anchor_text else ""
-        # Is the anchor anywhere in the article?
         keyword_present = _contains_anchor(full_text, anchor_text)
-        # If anchor is present, FORCE using the first block that contains it
         if keyword_present:
-            print("Anchor present in article → using anchor-first strategy.")
             anchor_block_indices = [i for i, b in enumerate(blocks) if _contains_anchor(b, anchor_text)]
             top_idx = [anchor_block_indices[0]] if anchor_block_indices else [0]
         else:
-            # No anchor present: use similarity search to choose the best block
-            print("Anchor NOT present → using similarity strategy.")
-            # Get a bit of target context for the query
-            try:
-                tgt_html = requests.get(target_url, timeout=20, headers=UA).text
-                tt = BeautifulSoup(tgt_html, "html.parser").title
-                tgt_title = tt.get_text().strip() if tt else ""
-            except Exception as e:
-                print(f"Error fetching target URL: {e}")
-                tgt_title = ""
-            ext = tldextract.extract(target_url)
-            tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
-            query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
             try:
                 q_emb = embed([query])[0]
                 blk_embs = embed(blocks)
                 sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks),1))
                 top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()
             except Exception as e:
-                print(f"Error in block embedding/similarity: {e}")
                 top_idx = [0]
         results = []
         for idx in top_idx:
-            try:
-                idx = min(idx, len(blocks)-1)
-                blk = blocks[idx]
-                # Split into sentences (also split on newlines)
-                sents = re.split(r'(?<=[.!?])\s+|\n+', blk)
-                sents = [s.strip() for s in sents if s and len(s.strip()) > 10]
-                if not sents:
-                    sents = [blk]
-                # If anchor is present overall and in this block, pick the sentence that CONTAINS the anchor
-                best_sent = None
-                if keyword_present and _contains_anchor(blk, anchor_text):
-                    anchor_sents = [s for s in sents if _contains_anchor(s, anchor_text)]
-                    if anchor_sents:
-                        best_sent = anchor_sents[0]
-                # Otherwise, fall back to embedding-based sentence choice
-                if best_sent is None:
-                    try:
-                        # Build a lightweight query
-                        query_sent = f"{anchor_text} context"
-                        q_emb_s = embed([query_sent])[0]
-                        s_embs = embed(sents)
-                        s_sims = F.cosine_similarity(s_embs, q_emb_s.repeat(len(sents),1))
-                        si = int(torch.argmax(s_sims).item())
-                        best_sent = sents[si]
-                    except Exception as e:
-                        print(f"Error in sentence selection: {e}, using first sentence")
-                        best_sent = sents[0]
-                if not best_sent or len(best_sent.strip()) == 0:
-                    best_sent = blk if blk else "Unable to extract sentence from this section."
-                # Anchor presence in the selected sentence
-                anchor_found_in_sentence = _contains_anchor(best_sent, anchor_text)
-                rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
-                result = {
-                    "anchor_was_present": anchor_found_in_sentence,
-                    "best_sentence_original": best_sent,
-                    "best_sentence_with_anchor": rewritten_sent,
-                    "keyword_in_article": keyword_present
-                }
-                if suggest_alternative and not keyword_present:
-                    try:
-                        alt_anchor, alt_content = find_alternative_anchor(blocks, target_url, anchor_text)
-                        if alt_anchor and alt_content:
-                            result["alternative_anchor"] = alt_anchor
-                            result["alternative_sentence_original"] = ""
-                            result["alternative_sentence"] = alt_content
-                            result["alternative_exact_match"] = True
-                    except Exception as e:
-                        print(f"Error generating alternative content: {e}")
-                results.append(result)
-            except Exception as e:
-                print(f"Error processing block {idx}: {e}")
-                results.append({
-                    "anchor_was_present": False,
-                    "best_sentence_original": blocks[0] if blocks else "Error extracting content",
-                    "best_sentence_with_anchor": f"Error processing content. Please try adding the link manually: <a href='{target_url}'>{anchor_text}</a>",
-                    "keyword_in_article": keyword_present
-                })
         return results
@@ -632,35 +604,45 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
     anchor_was_present = res.get("anchor_was_present", False)
     keyword_in_article = res.get("keyword_in_article", False)
     if keyword_in_article:
-        if anchor_was_present:
-            final_html = draft_html
-        else:
-            final_html = draft_html
-            if smart_rewrite:
-                g = gpt_rewrite(final_html, anchor_text, target_url, style="neutral", language=language_name)
-                final_html = g["sentence_html"]
-            polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
-            final_html = polished.get("sentence_html", final_html)
-        final_output = to_plain_text(final_html) if plain_text else final_html
         result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
         result += "🔗 Add link here:\n\n"
         result += f"{final_output}"
     else:
-        final_html = draft_html
-        if smart_rewrite:
-            g = gpt_rewrite(final_html, anchor_text, target_url, style="neutral", language=language_name)
-            final_html = g["sentence_html"]
-        polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
-        final_html = polished.get("sentence_html", final_html)
-        final_output = to_plain_text(final_html) if plain_text else final_html
         result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
         result += "🔗 Result 1 - Suggested placement:\n\n"
         result += f"Change this sentence: {original_sentence}\n\n"
         result += f"With this one: {final_output}"
     return result
 def clear_cache():
@@ -714,10 +696,10 @@ with gr.Blocks(title=f"Link Insertion Helper • GPT: {gpt_status}") as demo:
     gr.Markdown("""
 ### Features:
-- 🌍 **Auto Language Detection**: Preserves special characters (ć, č, š, ž, đ, etc.)
-- 💾 **Smart Caching**: Caches embeddings and API responses for faster repeated queries
-- 🎯 **Anchor-First Placement**: If the anchor exists, pick the exact sentence containing it
-- 🧠 **Similarity Fallback**: If not found, suggest the most relevant sentence via embeddings
 - 🧰 **Robust Extraction**: Trafilatura + BS4; optional PDF/Cloudflare handling
 """)

 import os, re, json, requests, urllib.parse, hashlib, html
 from functools import lru_cache
+from typing import List, Optional, Tuple
 # Torch / Transformers
 import torch, torch.nn.functional as F
     )
 }
+# --- OpenAI settings (simplified for GPT-5) ---
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")   # simplified per your request
 FALLBACK_OPENAI_MODEL  = "gpt-4o-mini"
 OPENAI_CHAT_URL        = "https://api.openai.com/v1/chat/completions"
         print(f"get_text_blocks fatal: {e}")
         return []
+# -------- target context helpers --------
+def get_target_context(url: str) -> Tuple[str, str, str, List[str]]:
+    """
+    Collect lightweight context about the target page.
+
+    Returns (title, meta_description, h1, content_blocks). Each string
+    defaults to "" and the block list to [] when that piece cannot be
+    extracted. When the fetch yields nothing, the empty defaults are
+    returned immediately and no text blocks are extracted.
+    """
+    page_title, description, heading = "", "", ""
+    try:
+        response = _fetch_bytes(url)
+        if not response:
+            return page_title, description, heading, []
+        doc = BeautifulSoup(response.text, "html.parser")
+        if doc.title and doc.title.get_text():
+            page_title = doc.title.get_text().strip()
+        # Standard description tag first, Open Graph description second
+        meta_tag = doc.find("meta", attrs={"name": "description"})
+        if not meta_tag:
+            meta_tag = doc.find("meta", attrs={"property":"og:description"})
+        if meta_tag and meta_tag.get("content"):
+            description = meta_tag["content"].strip()
+        first_h1 = doc.find("h1")
+        if first_h1:
+            heading = first_h1.get_text(" ", strip=True)
+    except Exception as e:
+        # Best effort: keep whatever was extracted before the failure
+        print(f"[target] soup err: {e}")
+    # Body paragraphs come from get_text_blocks, independent of the parse above
+    extracted = get_text_blocks(url, max_paragraphs=6)
+    return page_title, description, heading, (extracted if extracted else [])
+def keyword_fallback_from_title_domain(title: str, url: str) -> List[str]:
+    """
+    Heuristic keyword candidates derived from the page title and the domain.
+
+    Takes up to six tokens of at least four characters from _norm(title),
+    then the domain label (hyphens turned into spaces) plus its "reviews"
+    and "guide" variants. Duplicates are dropped in order, three generic
+    phrases are used when nothing else is available, and at most eight
+    candidates are returned.
+    """
+    domain_label = tldextract.extract(url).domain or ""
+    brand = domain_label.replace("-", " ").strip()
+    candidates = []
+    if title:
+        # crude noun-ish picks: keep only non-trivial tokens
+        long_tokens = [tok for tok in _norm(title).split() if len(tok) >= 4]
+        candidates += long_tokens[:6]
+    if brand:
+        # domain derived guesses
+        candidates += [brand, f"{brand} reviews", f"{brand} guide"]
+    # order-preserving dedupe of the stripped, non-empty candidates
+    stripped = (cand.strip() for cand in candidates)
+    unique = list(dict.fromkeys(cand for cand in stripped if cand))
+    if not unique:
+        # some generic fallbacks if still empty
+        unique = ["learn more", "full guide", "product details"]
+    return unique[:8]
 # =========================
 # Embedding helpers
 # =========================
 def inject_anchor_into_sentence(sentence, anchor_text, target_url):
     if not sentence or not anchor_text:
         return sentence, False
     try:
         pattern = re.compile(r'\b' + re.escape(anchor_text) + r'\b', re.IGNORECASE)
         if pattern.search(sentence):
             return result, True
     except Exception:
         pass
+    # else append a natural clause (no em dashes)
     if len(sentence) > 0 and sentence[-1] in '.!?':
         base, punct = sentence[:-1], sentence[-1]
     else:
         base, punct = sentence, '.'
+    rewritten = f'{base} {anchor_text}.' if anchor_text.lower().startswith("http") else f'{base} <a href="{target_url}">{anchor_text}</a>{punct}'
     return rewritten, False
 # =========================
+# OpenAI helpers (SIMPLE BODY for GPT-5)
 # =========================
+def _openai_chat_simple(model_name: str, system: str, user_json: dict):
+    """
+    Send one chat-completion request and return the reply as a dict.
+
+    Minimal body: model + messages only (no response_format / max_tokens etc.)
+    Because JSON mode is not requested, the reply is parsed leniently: a
+    Markdown code fence around the payload is removed before parsing, and
+    anything that does not parse to a JSON object is returned as
+    {"text": <raw reply>} so callers can always use .get().
+
+    Raises RuntimeError when OPENAI_API_KEY is not set; errors raised by
+    requests (transport failures, non-2xx status) are not caught here.
+    """
     if not OPENAI_API_KEY:
         raise RuntimeError("OPENAI_API_KEY not set")
     headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
     body = {
         "model": model_name,
         "messages": [
             {"role": "system", "content": system},
+            {"role": "user", "content": json.dumps(user_json, ensure_ascii=False)}
+        ]
     }
     r = requests.post(OPENAI_CHAT_URL, headers=headers, json=body, timeout=60)
     print(f"[GPT] Model={model_name} HTTP {r.status_code}")
     r.raise_for_status()
+    # content can be null in the API response; treat that as an empty reply
+    txt = r.json()["choices"][0]["message"]["content"] or ""
+    # Without response_format the model may wrap its JSON in a code fence
+    fenced = re.match(r"^\s*```[a-zA-Z]*\s*(.*?)\s*```\s*$", txt, re.DOTALL)
+    payload = fenced.group(1) if fenced else txt
+    try:
+        parsed = json.loads(payload)
+    except ValueError:
+        # if model returns plain text, wrap it
+        return {"text": txt}
+    # Callers use .get(), so a JSON list / string / number is wrapped as well
+    return parsed if isinstance(parsed, dict) else {"text": txt}
+def _openai_chat_cached(cache_key: str, model_name: str, system: str, user_json: dict):
+    """
+    Cached wrapper around _openai_chat_simple with a one-step model fallback.
+
+    Returns the entry stored under cache_key in API_RESPONSE_CACHE when there
+    is one. Otherwise asks model_name and, if that request fails, asks
+    FALLBACK_OPENAI_MODEL once. Only successful results are stored.
+
+    The fallback is skipped, and the original error re-raised, when the
+    failed request already used the fallback model or when no API key is
+    configured: a second identical request cannot succeed and would only
+    add another timeout. An error from the fallback request propagates.
+    """
+    if cache_key in API_RESPONSE_CACHE:
+        print(f"[GPT] Using cached response for {cache_key[:8]}...")
+        return API_RESPONSE_CACHE[cache_key]
     try:
+        result = _openai_chat_simple(model_name, system, user_json)
     except Exception as e:
+        if model_name == FALLBACK_OPENAI_MODEL or not OPENAI_API_KEY:
+            raise
         print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
+        result = _openai_chat_simple(FALLBACK_OPENAI_MODEL, system, user_json)
+    API_RESPONSE_CACHE[cache_key] = result
+    return result
+def gpt_rewrite(sentence_html, anchor_text, target_url, language="English", target_context: str = ""):
+    """
+    Target-aware rewrite. No 'avoid click here' restriction (supports generic anchors).
+
+    Asks the model to work anchor_text into sentence_html and returns
+    {"sentence_html": <html>}. The input sentence is returned unchanged when
+    no API key is set, when the request fails, when the reply is empty or
+    not a string, or when the input carried an <a> link and the reply lost
+    it. This function does not raise on API errors.
+    """
     if not OPENAI_API_KEY:
         return {"sentence_html": sentence_html}
+    cache_key = hashlib.md5(f"rw_{sentence_html}{anchor_text}{target_url}{language}{target_context}".encode()).hexdigest()
     system = (
+        f"You are a precise editor writing in {language}. "
+        "Integrate the provided anchor naturally into the sentence (or add a short clause). "
+        "Keep tone and length similar; no em dashes. Return JSON with key 'sentence_html' only."
     )
     user = {
+        "task": "rewrite_for_link_insertion",
         "sentence_html": sentence_html,
         "anchor_text": anchor_text,
         "target_url": target_url,
+        "target_context": target_context,
+        "language": language
     }
+    try:
+        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
+    except Exception as e:
+        # A failed rewrite must not take the whole suggestion down with it
+        print(f"[GPT] Rewrite failed: {e}. Using original sentence.")
+        return {"sentence_html": sentence_html}
+    if not isinstance(obj, dict):
+        return {"sentence_html": sentence_html}
+    out = obj.get("sentence_html", obj.get("text", sentence_html))
+    if not isinstance(out, str) or not out.strip():
+        return {"sentence_html": sentence_html}
+    # Never trade a draft that carries the link for a rewrite that dropped it
+    if "<a " in (sentence_html or "").lower() and "<a " not in out.lower():
+        return {"sentence_html": sentence_html}
+    return {"sentence_html": out}
+def gpt_get_search_keywords_from_context(ctx_text: str, target_url: str) -> List[str]:
+    """
+    Ask the model for search keyword phrases that describe the target page.
+
+    Returns a list of non-empty, stripped strings. The list is empty when no
+    API key is set, when the request fails, or when the reply holds no usable
+    "keywords" value. An empty list is what lets find_alternative_anchor
+    fall through to its title/domain heuristic, so API errors are caught
+    here rather than raised.
+    """
     if not OPENAI_API_KEY:
+        return []
+    cache_key = hashlib.md5(f"kw_{target_url}_{ctx_text[:600]}".encode()).hexdigest()
     system = (
+        "You are an SEO assistant. From the provided target page context, return 5-10 realistic keyword phrases "
+        "users would search for to find it. Return JSON {'keywords': [...] } only."
     )
+    user = {"url": target_url, "context": ctx_text}
+    try:
+        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
+    except Exception as e:
+        print(f"[GPT] Keyword extraction failed: {e}")
+        return []
+    raw = obj.get("keywords", []) if isinstance(obj, dict) else []
+    if isinstance(raw, str):
+        # a single phrase instead of a list still counts as one keyword
+        raw = [raw]
+    if not isinstance(raw, list):
+        return []
+    return [k.strip() for k in raw if isinstance(k, str) and k.strip()]
 def gpt_generate_content_with_keyword(source_blocks, keywords, target_url, language="English"):
+    """
+    Ask the model to pick one keyword and write linked content for it.
+
+    Returns None when no API key is set or keywords is empty. Otherwise
+    returns whatever _openai_chat_cached yields for the request; the prompt
+    asks for the keys chosen_keyword, new_content and insert_after_sentence.
+    Errors from the request are not caught here.
+    """
+    if not OPENAI_API_KEY:
+        return None
+    if not keywords:
+        return None
+    # Only the start of the article goes into the cache key
+    leading_text = " ".join(source_blocks[:3])
+    source_preview = leading_text[:500]
+    key_material = f"gen_{source_preview}_{str(keywords)}_{target_url}_{language}"
+    cache_key = hashlib.md5(key_material.encode()).hexdigest()
     system = (
         f"You are a skilled content writer in {language}. Given article paragraphs and keyword candidates "
+        "for a target link, do: 1) choose ONE best keyword; 2) write 1-2 natural sentences that include it "
+        "as an <a href> to target_url; 3) provide the exact source sentence AFTER WHICH to insert. "
         "Return JSON keys: chosen_keyword, new_content, insert_after_sentence."
     )
     user = {
         "article_paragraphs": source_blocks[:7],
         "available_keywords": keywords,
         "target_url": target_url,
+        "language": language
     }
+    return _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
 def to_plain_text(html_or_text: str) -> str:
+    """Return the visible text of an HTML fragment, with HTML entities unescaped."""
+    soup = BeautifulSoup(html_or_text, "html.parser")
+    # Text nodes are stripped and joined with single spaces
+    visible = soup.get_text(separator=" ", strip=True)
+    return html.unescape(visible)
 # =========================
+# Core logic (ANCHOR-FIRST + TARGET-AWARE)
 # =========================
+def build_target_context_string(target_url: str) -> str:
+    """
+    Flatten the target page context into one labelled, newline-separated string.
+
+    Uses the title, meta description, H1 and the first three body blocks
+    from get_target_context, skips the empty pieces, and truncates the
+    result to 2000 characters.
+    """
+    title, meta, h1, blocks = get_target_context(target_url)
+    labelled = (("Title: ", title), ("Meta: ", meta), ("H1: ", h1))
+    sections = [prefix + value for prefix, value in labelled if value]
+    if blocks:
+        sections.append("Body: " + " ".join(blocks[:3]))
+    context = "\n".join(sections)
+    return context[:2000]
 def find_alternative_anchor(blocks, target_url, original_anchor):
     try:
+        ctx = build_target_context_string(target_url)
+        print(f"[Alt] Target context len={len(ctx)}")
+        keywords = gpt_get_search_keywords_from_context(ctx, target_url)
+        if not keywords:
+            # Heuristic fallback from title/domain if GPT/ctx is weak
+            title, _, _, _ = get_target_context(target_url)
+            keywords = keyword_fallback_from_title_domain(title, target_url)
+        if not keywords:
             return None, None
         source_text = " ".join(blocks[:2])
+        language_name = get_language_name(detect_language(source_text))
         result = gpt_generate_content_with_keyword(
             source_blocks=blocks,
         if not result:
             return None, None
+        chosen_keyword = result.get("chosen_keyword", keywords[0])
         new_content = result.get("new_content", "")
         insert_after_sentence = result.get("insert_after_sentence", "")
         return chosen_keyword, f"{position_text}\n\n{new_content}" if position_text else new_content
     except Exception as e:
+        print(f"[Alt] Critical error: {e}")
         return None, None
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
         print("="*50)
         full_text = " ".join(blocks)
         keyword_present = _contains_anchor(full_text, anchor_text)
+        # Build target-aware query
+        t_title, t_meta, t_h1, _ = get_target_context(target_url)
+        ext = tldextract.extract(target_url)
+        tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
+        query = f"{anchor_text} — relevant to: {t_title or t_h1} | {t_meta} ({tgt_domain})"
+        # Choose candidate block indices
         if keyword_present:
+            print("Anchor present → use the first block containing it.")
             anchor_block_indices = [i for i, b in enumerate(blocks) if _contains_anchor(b, anchor_text)]
             top_idx = [anchor_block_indices[0]] if anchor_block_indices else [0]
         else:
+            print("Anchor NOT present → similarity search with target context.")
             try:
                 q_emb = embed([query])[0]
                 blk_embs = embed(blocks)
                 sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks),1))
                 top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()
             except Exception as e:
+                print(f"Similarity error: {e}")
                 top_idx = [0]
         results = []
         for idx in top_idx:
+            idx = min(idx, len(blocks)-1)
+            blk = blocks[idx]
+            # Split into sentences (also split on newlines)
+            sents = re.split(r'(?<=[.!?])\s+|\n+', blk)
+            sents = [s.strip() for s in sents if s and len(s.strip()) > 10]
+            if not sents:
+                sents = [blk]
+            # If anchor is present in block, pick the sentence that contains it
+            best_sent = None
+            if keyword_present and _contains_anchor(blk, anchor_text):
+                anchor_sents = [s for s in sents if _contains_anchor(s, anchor_text)]
+                if anchor_sents:
+                    best_sent = anchor_sents[0]
+            # Otherwise, choose via sentence-level similarity against target-aware mini query
+            if best_sent is None:
+                try:
+                    q_emb_s = embed([f"{anchor_text} {t_title} {t_h1}"])[0]
+                    s_embs = embed(sents)
+                    s_sims = F.cosine_similarity(s_embs, q_emb_s.repeat(len(sents),1))
+                    si = int(torch.argmax(s_sims).item())
+                    best_sent = sents[si]
+                except Exception as e:
+                    print(f"Sentence selection error: {e}")
+                    best_sent = sents[0]
+            if not best_sent or len(best_sent.strip()) == 0:
+                best_sent = blk
+            # Inject anchor (or append clause)
+            rewritten_sent, _ = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
+            result = {
+                "anchor_was_present": _contains_anchor(best_sent, anchor_text),
+                "best_sentence_original": best_sent,
+                "best_sentence_with_anchor": rewritten_sent,
+                "keyword_in_article": keyword_present
+            }
+            # Alternative anchor & content
+            if suggest_alternative and not keyword_present:
+                alt_anchor, alt_content = find_alternative_anchor(blocks, target_url, anchor_text)
+                if alt_anchor and alt_content:
+                    result["alternative_anchor"] = alt_anchor
+                    result["alternative_sentence_original"] = ""
+                    result["alternative_sentence"] = alt_content
+                    result["alternative_exact_match"] = True
+            results.append(result)
         return results
     anchor_was_present = res.get("anchor_was_present", False)
     keyword_in_article = res.get("keyword_in_article", False)
+    final_html = draft_html
+    if smart_rewrite:
+        # Pass target context to the rewrite so it aligns with the target page topic
+        ctx = build_target_context_string(target_url)
+        g = gpt_rewrite(final_html, anchor_text, target_url, language=language_name, target_context=ctx)
+        final_html = g["sentence_html"]
+    final_output = to_plain_text(final_html) if plain_text else final_html
     if keyword_in_article:
         result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
         result += "🔗 Add link here:\n\n"
         result += f"{final_output}"
     else:
         result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
         result += "🔗 Result 1 - Suggested placement:\n\n"
         result += f"Change this sentence: {original_sentence}\n\n"
         result += f"With this one: {final_output}"
+        # Show alternative if available
+        if suggest_alternative_anchor and res.get("alternative_anchor"):
+            alt_anchor = res["alternative_anchor"]
+            alt_content = res.get("alternative_sentence", "")
+            if alt_content:
+                if "[Insert after:" in alt_content:
+                    parts = alt_content.split("\n\n", 1)
+                    position_info = parts[0] if len(parts) > 0 else ""
+                    actual_content = parts[1] if len(parts) > 1 else alt_content
+                else:
+                    position_info = ""
+                    actual_content = alt_content
+                alt_output = to_plain_text(actual_content) if plain_text else actual_content
+                result += f"\n\n{'='*50}\n\n"
+                result += "🔗 Result 2 - Suggested new anchor and placement:\n"
+                result += f"💡 Using keyword: '{alt_anchor}'\n"
+                if position_info and "[Insert after:" in position_info:
+                    result += f"📍 {position_info}\n"
+                result += f"\n{alt_output}"
     return result
 def clear_cache():
     gr.Markdown("""
 ### Features:
+- 🌍 **Auto Language Detection** (ć, č, š, ž, đ preserved)
+- 🎯 **Anchor-First** if present; otherwise **Target-Aware** similarity
+- 🧠 **Target-Aware Rewrite** (uses title/meta/H1/body from the target page)
+- 🔄 **Alternative Anchor** with GPT + heuristic fallback (always tries to return Result 2)
 - 🧰 **Robust Extraction**: Trafilatura + BS4; optional PDF/Cloudflare handling
 """)