Spaces:

dusan-presswhizz
/

PressWhizz-Link-Insert-Suggestion-API

Sleeping

App Files Files Community

dusan-presswhizz committed on Aug 23, 2025

Commit

3d8425d

verified ·

1 Parent(s): 3840a00

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -213

app.py CHANGED Viewed

@@ -13,8 +13,9 @@ UA = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML,
 # --- OpenAI settings ---
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # add in HF Spaces: Settings → Variables & secrets
-PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5o")  # preferred model
-FALLBACK_OPENAI_MODEL  = "gpt-4o-mini"                        # automatic fallback
 OPENAI_CHAT_URL        = "https://api.openai.com/v1/chat/completions"
 # =========================
@@ -66,53 +67,28 @@ def embed(texts):
         out = enc(**batch)
     return mean_pool(out.last_hidden_state, batch["attention_mask"])
-# ---------- Fallback: integrate anchor mid-sentence (no em-dash, no clichés, neutral nouns)
 def inject_anchor_into_sentence(sentence, anchor_text, target_url):
-    """
-    Wrap anchor if present; otherwise integrate mid-sentence with a neutral preposition.
-    No em-dash. Avoid CTA clichés. Do not assert target content type.
-    Prefer 'Related resource' add-after if sentence begins with 'This guide' etc.
-    """
     def norm(x): return re.sub(r'[^a-z0-9 ]','',x.lower())
     n_sent, n_anchor = norm(sentence), norm(anchor_text)
-    # If sentence clearly has its own subject ("This guide", "Our platform", "Base Casino"), prefer add-after
-    if n_sent.startswith("this guide") or n_sent.startswith("our platform") or n_sent.startswith("base casino"):
-        html = sentence
-        add_after = f' Related resource: <a href="{target_url}">{anchor_text}</a>.'
-        return html + add_after, False
-    # 1) If anchor words already present, wrap them
     if n_anchor and n_anchor in n_sent:
         pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
         return pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence), True
-    # 2) Otherwise, insert "at/on/from <a>anchor</a>" near a suitable noun
-    insert_html = f'<a href="{target_url}">{anchor_text}</a>'
-    m = re.search(r'\b(games?|content|options?|features?|benefits?)\b', sentence, flags=re.I)
-    if m:
-        idx = m.start()
-        return (sentence[:idx] + f' at {insert_html} ' + sentence[idx:]).strip(), False
-    # after first comma
-    m2 = re.search(r',\s*', sentence)
-    if m2:
-        idx = m2.end()
-        return (sentence[:idx] + f' at {insert_html} ' + sentence[idx:]).strip(), False
-    # around "to"
-    m3 = re.search(r'\bto\b', sentence, flags=re.I)
-    if m3:
-        idx = m3.start()
-        return (sentence[:idx] + f' at {insert_html} ' + sentence[idx:]).strip(), False
-    # last resort: short neutral phrase
     if sentence.endswith(('.', '!', '?')):
         base, punct = sentence[:-1], sentence[-1]
     else:
         base, punct = sentence, '.'
-    rewritten = f'{base} with additional context available at {insert_html}{punct}'
     return rewritten, False
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
@@ -120,21 +96,17 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
     if not blocks:
         return [{"error":"No text blocks found on the page."}]
-    # -------- target context (title + meta desc)
     try:
         tgt_html = requests.get(target_url, timeout=20, headers=UA).text
-        soup_tgt = BeautifulSoup(tgt_html, "html.parser")
-        tt = soup_tgt.title.get_text().strip() if soup_tgt.title else ""
-        md = soup_tgt.find("meta", attrs={"name": "description"})
-        tgt_desc = (md.get("content") or "").strip() if md else ""
-        tgt_title = tt
     except Exception:
-        tgt_title, tgt_desc = "", ""
     ext = tldextract.extract(target_url)
     tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
-    # NOTE: internal query string only (not shown to users)
     query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
     q_emb = embed([query])[0]
@@ -144,7 +116,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
     results = []
     for idx in top_idx:
-        blk = blocks[idx]                   # full paragraph
         sents = re.split(r'(?<=[.!?])\s+', blk)
         s_embs = embed(sents)
         s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
@@ -154,87 +126,12 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
         results.append({
             "anchor_was_present": exact_found,
             "best_sentence_original": best_sent,
-            "best_sentence_with_anchor": rewritten_sent,
-            "best_paragraph": blk,
-            "tgt_title": tgt_title,
-            "tgt_desc": tgt_desc
         })
     return results
-# ---------- Plain-text helper (preserve spacing between tags)
-def to_plain_text(html_or_text):
-    return BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
-# ---------- Distortion / safety helpers
-def detect_primary_brand(paragraph: str) -> str:
-    """
-    Heuristic: catch brand phrases like 'Base Casino', 'Acme Platform', 'Something App'.
-    Returns lowercased brand phrase or ''.
-    """
-    p = paragraph.strip()
-    m = re.search(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\s+(Casino|Platform|Site|Service|App)\b', p)
-    if m:
-        return (m.group(0)).lower()
-    m2 = re.search(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})\b', p)
-    return m2.group(0).lower() if m2 else ""
-def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, anchor_text: str, paragraph_text: str = "") -> bool:
-    """
-    True if the rewrite likely misattributes the subject or positions the anchor as the mechanism.
-    Also flags if the anchor appears before the paragraph's main brand or too early overall,
-    or if it introduces content-type nouns that weren't present in the original.
-    """
-    plain_rewrite = BeautifulSoup(rewritten_html, "html.parser").get_text(" ").strip().lower()
-    plain_orig    = original_text.strip().lower()
-    a = anchor_text.strip().lower()
-    brand = detect_primary_brand(paragraph_text)
-    if brand and a in plain_rewrite:
-        pos_a = plain_rewrite.find(a)
-        pos_b = plain_rewrite.find(brand)
-        if pos_b != -1 and pos_a != -1 and pos_a < pos_b:
-            return True  # anchor introduced before the paragraph’s brand
-    # Anchor appears very early -> often implies subject shift
-    if a in plain_rewrite:
-        pos = plain_rewrite.find(a)
-        if pos != -1 and pos <= max(4, int(0.20 * len(plain_rewrite))):
-            return True
-    # Anchor as the mechanism or double "at"
-    mechanism_patterns = [
-        rf'\bthrough\s+{re.escape(a)}\b',
-        rf'\bvia\s+{re.escape(a)}\b',
-        rf'\bat\s+{re.escape(a)}\s+at\b',
-        rf'\bon\s+{re.escape(a)}\s+at\b',
-    ]
-    for pat in mechanism_patterns:
-        if re.search(pat, plain_rewrite):
-            return True
-    # Re-attribute authorship/hosting to anchor
-    bad_hosting = [
-        rf'(this|the)\s+guide\s+(at|on|from)\s+{re.escape(a)}\b',
-        rf'\b{re.escape(a)}\b\s+(explains|shows|details|covers)\b',
-        r'\b(guide|article|post|review)\s+(at|on|from)\s+',
-    ]
-    for pat in bad_hosting:
-        if re.search(pat, plain_rewrite):
-            return True
-    # Introducing content-type nouns when not present in original
-    content_nouns = ["guide", "article", "post", "review", "platform", "site", "resource"]
-    if any(n in plain_rewrite for n in content_nouns) and not any(n in plain_orig for n in content_nouns):
-        return True
-    return False
-def build_related_resource_line(target_url: str, anchor_text: str, plain_text=False) -> str:
-    html = f'Related resource: <a href="{target_url}">{anchor_text}</a>.'
-    return to_plain_text(html) if plain_text else html
 # =========================
-# GPT rewrite (editorial with paragraph context; can choose inline vs add-after)
 # =========================
 def _openai_chat(model_name: str, system: str, user_json: dict):
     headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
@@ -253,78 +150,61 @@ def _openai_chat(model_name: str, system: str, user_json: dict):
     txt = r.json()["choices"][0]["message"]["content"]
     return json.loads(txt)
-def gpt_decide_and_rewrite(paragraph_text, chosen_sentence, anchor_text, target_url, tgt_title, tgt_desc):
     """
-    Sends FULL PARAGRAPH + CHOSEN SENTENCE + TARGET METADATA to GPT.
-    GPT must return:
-      - mode: "inline" or "add_after"
-      - sentence_html (required if mode=inline)
-      - add_after_html (required if mode=add_after)
-    Enforces: no em-dash, no CTA clichés, neutral attribution unless metadata allows.
     """
     if not OPENAI_API_KEY:
-        print("[GPT] No OPENAI_API_KEY found → using fallback inline.")
-        return {"mode": "inline", "sentence_html": chosen_sentence}
-    # Determine which content-type nouns are allowed based on metadata
-    meta = f"{tgt_title} {tgt_desc}".lower()
-    allowed_nouns = [w for w in ["guide","article","blog","review","platform","site","resource"] if w in meta]
     system = (
-        "You are a professional content editor.\n"
-        "You receive the full paragraph, the chosen sentence, the anchor text, the target URL, and target metadata.\n"
-        "Decide the safest strategy:\n"
-        "A) inline — produce ONE rewritten version of the chosen sentence with the anchor integrated mid-sentence, "
-        "not at the end, preserving the paragraph’s subject/scope. Use clear, publication-quality English.\n"
-        "B) add_after — if an inline rewrite would distort the meaning or re-attribute authorship/hosting to the anchor site, "
-        "leave the sentence unchanged and instead output a short neutral line to add after the paragraph.\n\n"
-        "HARD RULES:\n"
-        "1) If inline: include an <a href> with the EXACT anchor text; keep length close; no em-dash; avoid 'for details', "
-        "'click here', 'learn more', 'visit', 'read more', 'via', 'through'. Do NOT present the anchor as the mechanism "
-        "for the action (never 'through ANCHOR', 'via ANCHOR'). Prefer neutral adjuncts like 'also at', 'with context at', "
-        "'additional information at', or 'resources at' before the anchor. Place the anchor within the first 70% of the sentence "
-        "but after the paragraph’s brand/subject.\n"
-        "2) If add_after: return a single short line like 'Related resource: <a href=\"URL\">ANCHOR</a>.' "
-        "(12–14 words max, neutral tone).\n\n"
-        "OUTPUT JSON ONLY with keys: mode ('inline'|'add_after'), sentence_html (if inline), add_after_html (if add_after)."
     )
     user = {
-        "paragraph_text": paragraph_text,
-        "chosen_sentence": chosen_sentence,
         "anchor_text": anchor_text,
         "target_url": target_url,
-        "target_metadata": {"title": tgt_title, "description": tgt_desc},
-        "allowed_nouns_from_metadata": allowed_nouns,
         "constraints": {
-            "avoid": [
-                "for details", "click here", "learn more", "visit", "read more",
-                "via", "through", "—", "--", " - "
-            ],
-            "preferred_connectors": ["at", "on", "from", "in"],
-            "place_anchor": "inside_first_70_percent"
         }
     }
     try:
         obj = _openai_chat(PREFERRED_OPENAI_MODEL, system, user)
     except Exception as e:
         print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
         try:
             obj = _openai_chat(FALLBACK_OPENAI_MODEL, system, user)
         except Exception as e2:
-            print(f"[GPT] Fallback failed: {e2}. Using inline fallback.")
-            return {"mode": "inline", "sentence_html": chosen_sentence}
-    # Normalize output
-    mode = obj.get("mode", "inline")
-    if mode not in ("inline", "add_after"):
-        mode = "inline"
-    return {
-        "mode": mode,
-        "sentence_html": obj.get("sentence_html", ""),
-        "add_after_html": obj.get("add_after_html", "")
-    }
 # =========================
 # Gradio UI
@@ -345,47 +225,25 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text):
     if "error" in res:
         return f"❌ {res['error']}"
-    draft_html    = res["best_sentence_with_anchor"]
-    orig_sentence = res["best_sentence_original"]
-    paragraph     = res["best_paragraph"]
-    tgt_title     = res.get("tgt_title", "")
-    tgt_desc      = res.get("tgt_desc", "")
-    # Optional conservative rule: force add-after for "This guide ..."
-    # if orig_sentence.strip().lower().startswith("this guide"):
-    #     add_after = build_related_resource_line(target_url, anchor_text, plain_text)
-    #     return warn + "Add this mini-line after the paragraph:\n\n" + add_after
     if smart_rewrite:
-        # Ask GPT to decide: inline vs add-after (with full paragraph context)
-        decision = gpt_decide_and_rewrite(paragraph, orig_sentence, anchor_text, target_url, tgt_title, tgt_desc)
-        mode = decision.get("mode", "inline")
-        if mode == "inline":
-            final_html = decision.get("sentence_html", "") or draft_html
-            # Safety gate: reject if it would distort meaning
-            if rewrite_would_distort_meaning(orig_sentence, final_html, anchor_text, paragraph):
-                add_after = build_related_resource_line(target_url, anchor_text, plain_text)
-                return warn + "Add this mini-line after the paragraph (to avoid changing its meaning):\n\n" + add_after
-            final_output = to_plain_text(final_html) if plain_text else final_html
-            # We propose a replacement to ensure the exact integrated version is used
-            return warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
-        else:  # add_after
-            add_line = decision.get("add_after_html") or build_related_resource_line(target_url, anchor_text, False)
-            add_line_out = to_plain_text(add_line) if plain_text else add_line
-            return warn + "Add this mini-line after the paragraph:\n\n" + add_line_out
     else:
-        # No GPT: use heuristic inline fallback already injected in draft_html
-        final_output = to_plain_text(draft_html) if plain_text else draft_html
-        if res.get("anchor_was_present", False):
-            return warn + f"✅ Add link here:\n\n{final_output}"
-        else:
-            return warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
-# Show GPT status / model in the header
 gpt_status = "ON" if OPENAI_API_KEY else "OFF"
 title_model = PREFERRED_OPENAI_MODEL if OPENAI_API_KEY else "OFF"
@@ -400,7 +258,7 @@ demo = gr.Interface(
     ],
     outputs=gr.Textbox(label="Result", lines=12),
     title=f"Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}",
-    description="Chooses safe inline rewrite vs neutral add-after using full paragraph context. Toggle GPT and Plain text (no URL) as needed."
 )
 if __name__ == "__main__":

 # --- OpenAI settings ---
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # add in HF Spaces: Settings → Variables & secrets
+# Preferred model (you asked for “the new 5”): try it first, then fall back to a widely available fast model
+PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5o")     # change here if you like
+FALLBACK_OPENAI_MODEL  = "gpt-4o-mini"                            # automatic fallback
 OPENAI_CHAT_URL        = "https://api.openai.com/v1/chat/completions"
 # =========================
         out = enc(**batch)
     return mean_pool(out.last_hidden_state, batch["attention_mask"])
 def inject_anchor_into_sentence(sentence, anchor_text, target_url):
+    """Wrap anchor if present; otherwise integrate link smoothly (no em-dash, no clichés)."""
     def norm(x): return re.sub(r'[^a-z0-9 ]','',x.lower())
     n_sent, n_anchor = norm(sentence), norm(anchor_text)
     if n_anchor and n_anchor in n_sent:
         pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
         return pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence), True
+    # Build a natural, short integration clause (no em-dash)
     if sentence.endswith(('.', '!', '?')):
         base, punct = sentence[:-1], sentence[-1]
     else:
         base, punct = sentence, '.'
+    clause_options = [
+        f' with insights from <a href="{target_url}">{anchor_text}</a>',
+        f' through <a href="{target_url}">{anchor_text}</a>',
+        f' via <a href="{target_url}">{anchor_text}</a>',
+    ]
+    clause = clause_options[0]
+    rewritten = f'{base}{clause}{punct}'
     return rewritten, False
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
     if not blocks:
         return [{"error":"No text blocks found on the page."}]
+    # target context
     try:
         tgt_html = requests.get(target_url, timeout=20, headers=UA).text
+        tt = BeautifulSoup(tgt_html, "html.parser").title
+        tgt_title = tt.get_text().strip() if tt else ""
     except Exception:
+        tgt_title = ""
     ext = tldextract.extract(target_url)
     tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
     query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
     q_emb = embed([query])[0]
     results = []
     for idx in top_idx:
+        blk = blocks[idx]
         sents = re.split(r'(?<=[.!?])\s+', blk)
         s_embs = embed(sents)
         s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
         results.append({
             "anchor_was_present": exact_found,
             "best_sentence_original": best_sent,
+            "best_sentence_with_anchor": rewritten_sent
         })
     return results
 # =========================
+# GPT rewrite (editorial, no em-dash, no clichés)
 # =========================
 def _openai_chat(model_name: str, system: str, user_json: dict):
     headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
     txt = r.json()["choices"][0]["message"]["content"]
     return json.loads(txt)
+def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral"):
     """
+    Stronger editorial rewrite:
+    - Integrates the anchor naturally (subject/object/prepositional phrase)
+    - No em-dash; avoid “for details / click here / learn more / visit / read more”
+    - Returns: {"sentence_html": "<final html>"}
     """
     if not OPENAI_API_KEY:
+        print("[GPT] No OPENAI_API_KEY found → using fallback.")
+        return {"sentence_html": sentence_html}
     system = (
+        "You are a skilled content editor. Improve fluency and integrate the given anchor naturally "
+        "into ONE sentence of similar length. Use clear, publication-quality English. "
+        "STRICT RULES: (1) Include an <a href> tag that uses the EXACT anchor text. "
+        "(2) Do NOT use an em dash or any dash. "
+        '(3) Avoid phrases like "for details", "click here", "learn more", "visit", "read more". '
+        "Prefer integrating the anchor as part of the sentence (subject, object, or prepositional phrase), "
+        "e.g., “with insights from <a ...>ANCHOR</a>”, “through <a ...>ANCHOR</a>”, or “via <a ...>ANCHOR</a>”. "
+        "Return a compact JSON object with key sentence_html only. No extra keys, no markdown."
     )
     user = {
+        "task": "rewrite_for_link_insertion",
+        "sentence_html": sentence_html,
         "anchor_text": anchor_text,
         "target_url": target_url,
+        "style": style,
         "constraints": {
+            "max_extra_words": 20,
+            "avoid": ["for details", "click here", "learn more", "visit", "read more", "—", "--", " - "]
         }
     }
+    # Try preferred model first, then fallback if needed
     try:
+        print("[GPT] Calling OpenAI Chat Completions with preferred model…")
         obj = _openai_chat(PREFERRED_OPENAI_MODEL, system, user)
     except Exception as e:
         print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
         try:
             obj = _openai_chat(FALLBACK_OPENAI_MODEL, system, user)
         except Exception as e2:
+            print(f"[GPT] Fallback failed: {e2}. Using fallback sentence.")
+            return {"sentence_html": sentence_html}
+    out = obj.get("sentence_html", sentence_html)
+    # Safety: ensure the anchor words are present (model must not drop the anchor)
+    if anchor_text.lower() not in BeautifulSoup(out, "html.parser").get_text().lower():
+        return {"sentence_html": sentence_html}
+    return {"sentence_html": out}
+# ---------- Plain-text helper (preserve spacing between tags)
+def to_plain_text(html_or_text):
+    return BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
 # =========================
 # Gradio UI
     if "error" in res:
         return f"❌ {res['error']}"
+    draft_html = res["best_sentence_with_anchor"]
+    # Optionally pass through GPT for a cleaner sentence
     if smart_rewrite:
+        g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral")
+        final_html = g["sentence_html"]
+    else:
+        final_html = draft_html
+    # Optionally convert to plain text (no <a>, no tags)
+    final_output = to_plain_text(final_html) if plain_text else final_html
+    if res.get("anchor_was_present", False):
+        return warn + f"✅ Add link here:\n\n{final_output}"
     else:
+        original_sentence = res['best_sentence_original']
+        return warn + f"Change this sentence:\n\n{original_sentence}\n\nWith this one:\n\n{final_output}"
+# Show GPT status in the header
 gpt_status = "ON" if OPENAI_API_KEY else "OFF"
 title_model = PREFERRED_OPENAI_MODEL if OPENAI_API_KEY else "OFF"
     ],
     outputs=gr.Textbox(label="Result", lines=12),
     title=f"Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}",
+    description="Suggests the best place to add your link and returns one clean instruction. Toggle GPT and Plain text (no URL) as needed."
 )
 if __name__ == "__main__":