Spaces:

dusan-presswhizz
/

PressWhizz-Link-Insert-Suggestion-API

Sleeping

App Files Community

dusan-presswhizz committed on Aug 24, 2025

Commit

bb074b3

verified ·

1 Parent(s): c9ce95a

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -244

app.py CHANGED Viewed

@@ -314,7 +314,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
                     if i < 0 or i+length > len(words):
                         continue
                     phrase = ' '.join(words[i:i+length])
-                    phrase_clean = phrase.strip('.,!?;:"\'')
                     # Check if phrase is meaningful
                     if i < len(words) and i+length-1 < len(words):
@@ -330,7 +330,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
             # Also extract single important words (proper nouns, long words)
             for word in words:
-                clean_word = word.strip('.,!?;:"\'')
                 if clean_word and (len(clean_word) > 6 or
                     (len(clean_word) > 0 and clean_word[0].isupper() and clean_word.lower() not in stopwords)):
                     all_phrases.add(clean_word)
@@ -391,115 +391,6 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
         return None, None
-    except Exception as e:
-        print(f"Critical error in find_alternative_anchor: {e}")
-        return None, None
-def analyze_target_url(target_url):
-    """Deeply analyze the target URL to understand what the page is about."""
-    try:
-        # Use the same extraction logic as get_text_blocks
-        blocks = get_text_blocks(target_url, max_paragraphs=10)  # Get more content for better understanding
-        # Also get metadata separately
-        try:
-            resp = requests.get(target_url, timeout=20, headers=UA)
-            soup = BeautifulSoup(resp.text, "html.parser")
-            # Extract title
-            title = soup.title.get_text().strip() if soup.title else ""
-            # Extract meta description
-            meta_desc = ""
-            meta_tag = soup.find("meta", attrs={"name": "description"})
-            if meta_tag:
-                meta_desc = meta_tag.get("content", "")
-            # Extract h1-h3 headings for topic understanding
-            headings = []
-            for h in soup.find_all(['h1', 'h2', 'h3'])[:10]:
-                heading_text = h.get_text().strip()
-                if heading_text:
-                    headings.append(heading_text)
-        except Exception as e:
-            print(f"Error getting metadata: {e}")
-            title = ""
-            meta_desc = ""
-            headings = []
-        # Combine blocks into full text
-        full_text = " ".join(blocks) if blocks else ""
-        main_content = full_text[:1500] if full_text else ""
-        target_context = {
-            "title": title,
-            "meta_description": meta_desc,
-            "headings": headings,
-            "main_content": main_content,
-            "full_text": full_text[:3000],  # Limit for embedding
-            "summary": f"{title} {meta_desc} {' '.join(headings[:5])} {main_content[:500]}"
-        }
-        print(f"\nTarget URL Analysis:")
-        print(f"  Title: {title[:100]}")
-        print(f"  Meta: {meta_desc[:100]}")
-        print(f"  Main headings: {headings[:3]}")
-        print(f"  Extracted {len(blocks)} blocks")
-        return target_context
-    except Exception as e:
-        print(f"Error analyzing target URL: {e}")
-        return {
-            "title": "",
-            "meta_description": "",
-            "headings": [],
-            "main_content": "",
-            "full_text": "",
-            "summary": ""
-        }
-def validate_anchor_relevance(anchor_text, sentence, target_context, threshold=0.3):
-    """Check if the anchor and sentence are relevant to the target page content."""
-    try:
-        # Create embedding for target page context
-        target_summary = target_context.get("summary", "")
-        if not target_summary:
-            return True  # If we can't analyze, assume it's ok
-        # Embed target content
-        target_emb = embed([target_summary])[0]
-        # Check anchor relevance to target
-        anchor_emb = embed([anchor_text])[0]
-        anchor_relevance = F.cosine_similarity(
-            anchor_emb.unsqueeze(0),
-            target_emb.unsqueeze(0)
-        ).item()
-        # Check sentence relevance to target
-        sentence_emb = embed([sentence])[0]
-        sentence_relevance = F.cosine_similarity(
-            sentence_emb.unsqueeze(0),
-            target_emb.unsqueeze(0)
-        ).item()
-        print(f"\nRelevance scores:")
-        print(f"  Anchor '{anchor_text}' to target: {anchor_relevance:.3f}")
-        print(f"  Sentence to target: {sentence_relevance:.3f}")
-        # Return true if either anchor or sentence is relevant enough
-        is_relevant = anchor_relevance > threshold or sentence_relevance > threshold
-        if not is_relevant:
-            print(f"  ⚠️ Low relevance detected! Anchor/sentence may not match target page topic.")
-        return is_relevant, anchor_relevance, sentence_relevance
-    except Exception as e:
-        print(f"Error validating relevance: {e}")
-        return True, 0.5, 0.5  # Default to allowing if error
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
     try:
@@ -512,21 +403,6 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
         print(f"DEBUG: Looking for anchor: '{anchor_text}'")
         print("="*50)
-        # ANALYZE TARGET URL FIRST - This is the key addition
-        target_context = analyze_target_url(target_url)
-        # Validate that the anchor text is relevant to the target page
-        is_relevant, anchor_score, _ = validate_anchor_relevance(
-            anchor_text,
-            anchor_text,  # Check anchor against itself first
-            target_context,
-            threshold=0.25  # Lower threshold for initial check
-        )
-        if not is_relevant and anchor_score < 0.2:
-            print(f"\n⚠️ WARNING: Anchor '{anchor_text}' seems unrelated to target page content!")
-            print(f"Target page appears to be about: {target_context['title'][:100]}")
         # Check if keyword is present in the article
         full_text = " ".join(blocks)
         full_text_lower = full_text.lower()
@@ -558,11 +434,20 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
         print(f"Keyword present in article: {keyword_present}")
         ext = tldextract.extract(target_url)
         tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
-        # Use the comprehensive target context for finding best match
-        query = f"{anchor_text} — relevant to: {target_context['summary'][:200]}"
         try:
             q_emb = embed([query])[0]
@@ -619,23 +504,11 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
                 rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
-                # Validate the sentence relevance to target before including it
-                is_relevant, _, sent_relevance = validate_anchor_relevance(
-                    anchor_text,
-                    best_sent,
-                    target_context,
-                    threshold=0.25
-                )
                 result = {
                     "anchor_was_present": anchor_found_in_sentence,
                     "best_sentence_original": best_sent,
                     "best_sentence_with_anchor": rewritten_sent,
-                    "keyword_in_article": keyword_present,
-                    "relevance_score": sent_relevance,
-                    "is_relevant": is_relevant,
-                    "target_title": target_context.get("title", ""),
-                    "target_topic": target_context.get("meta_description", "")[:100]
                 }
                 # If anchor not present in article and alternative suggestion requested
@@ -653,8 +526,6 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
                             result["alternative_exact_match"] = alt_exact
                     except Exception as e:
                         print(f"Error finding alternative anchor: {e}")
-                        import traceback
-                        traceback.print_exc()
                         # Continue without alternative
                 results.append(result)
@@ -771,89 +642,7 @@ def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", languag
     # Don't check for exact anchor text match as it might have special chars
     return {"sentence_html": out}
-def gpt_get_target_keywords(target_url, target_context, language="English"):
-    """Ask GPT to suggest 5-10 relevant search keywords users would use to find this page."""
-    if not OPENAI_API_KEY:
-        return []
-    # Create cache key
-    cache_key = hashlib.md5(f"keywords_{target_url}{language}".encode()).hexdigest()
-    if cache_key in API_RESPONSE_CACHE:
-        print(f"[GPT] Using cached keywords for {target_url[:30]}...")
-        return API_RESPONSE_CACHE[cache_key].get("keywords", [])
-    title = target_context.get("title", "")
-    meta = target_context.get("meta_description", "")
-    content = target_context.get("main_content", "")[:500]
-    system = (
-        "You are an SEO expert. Based on the page content provided, suggest 5-10 search keywords or phrases "
-        "that users would likely type into Google to find this page. "
-        "Include both short keywords (1-2 words) and long-tail keywords (3-5 words). "
-        "Make them realistic search terms, not just words from the page. "
-        f"Consider the {language} language and local search patterns. "
-        "Return JSON with a 'keywords' array."
-    )
-    user = {
-        "url": target_url,
-        "title": title,
-        "meta_description": meta,
-        "content_preview": content,
-        "task": "Generate search keywords users would use to find this page"
-    }
-    try:
-        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
-        keywords = obj.get("keywords", [])
-        print(f"\n[GPT] Target page keywords: {keywords}")
-        return keywords
-    except Exception as e:
-        print(f"[GPT] Error getting keywords: {e}")
-        return []
-def gpt_add_keyword_to_content(blocks, keywords, target_url, language="English"):
-    """Ask GPT to naturally add one of the keywords to the content with proper context."""
-    if not OPENAI_API_KEY or not keywords:
-        return None
-    # Create cache key
-    blocks_preview = " ".join(blocks[:3])[:500]
-    cache_key = hashlib.md5(f"add_kw_{blocks_preview}{str(keywords)}{target_url}".encode()).hexdigest()
-    if cache_key in API_RESPONSE_CACHE:
-        return API_RESPONSE_CACHE[cache_key]
-    system = (
-        f"You are a skilled content editor writing in {language}. "
-        "Your task is to naturally integrate ONE of the provided keywords into the article content. "
-        "RULES: "
-        "1. Choose the keyword that fits most naturally with the existing content "
-        "2. Add 2-3 sentences or a short paragraph that includes the keyword "
-        "3. Make it flow naturally - it should feel like it belongs there "
-        "4. Include an HTML link using the keyword as anchor text "
-        "5. Specify WHERE to add it (e.g., 'after the second paragraph', 'before the conclusion') "
-        "6. The addition should provide value, not just keyword stuffing "
-        f"7. Write in {language} and preserve special characters "
-        "Return JSON with: 'keyword_used', 'content_to_add', 'placement_instruction'"
-    )
-    user = {
-        "article_preview": " ".join(blocks[:5]),
-        "available_keywords": keywords,
-        "target_url": target_url,
-        "language": language,
-        "task": "Add one keyword naturally to the content"
-    }
-    try:
-        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
-        API_RESPONSE_CACHE[cache_key] = obj
-        return obj
-    except Exception as e:
-        print(f"[GPT] Error adding keyword: {e}")
-        return None
     """
     Final QA pass with language support.
     """
@@ -949,17 +738,6 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
     # Check if anchor was already present in the article
     anchor_was_present = res.get("anchor_was_present", False)
     keyword_in_article = res.get("keyword_in_article", False)
-    relevance_score = res.get("relevance_score", 0)
-    is_relevant = res.get("is_relevant", True)
-    target_title = res.get("target_title", "")
-    target_topic = res.get("target_topic", "")
-    # Add warning if low relevance detected
-    relevance_warning = ""
-    if not is_relevant or relevance_score < 0.25:
-        relevance_warning = f"\n\n⚠️ **Warning**: The suggested content may not be highly relevant to the target page.\n"
-        relevance_warning += f"Target page appears to be about: {target_title[:100]}\n"
-        relevance_warning += f"Relevance score: {relevance_score:.2f}\n"
     # If anchor is present in the article (even if not in the best sentence)
     if keyword_in_article:
@@ -968,10 +746,8 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
             # Anchor is in the suggested sentence - just show where to add the link
             final_output = to_plain_text(draft_html) if plain_text else draft_html
             result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
-            result += f"🔗 Add link here:\n\n"
             result += f"{final_output}"
-            result += relevance_warning
-            result += relevance_warning
         else:
             # Anchor is in article but not in this sentence
             if smart_rewrite:
@@ -985,7 +761,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
             final_output = to_plain_text(final_html) if plain_text else final_html
             result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
-            result += f"🔗 Add link here:\n\n"
             result += f"{final_output}"
     else:
         # Anchor doesn't exist in article at all - need to add it
@@ -1000,10 +776,9 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
         final_output = to_plain_text(final_html) if plain_text else final_html
         result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
-        result += f"🔗 Result 1 - Suggested placement:\n\n"
         result += f"Original: {original_sentence}\n\n"
         result += f"Suggested: {final_output}"
-        result += relevance_warning
         # Show alternative if requested and available
         if suggest_alternative_anchor and res.get("alternative_anchor"):
@@ -1032,7 +807,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
                 # Add alternative as Result 2
                 result += f"\n\n{'='*50}\n\n"
-                result += f"🔗 Result 2 - Alternative from article:\n"
                 result += f"💡 Alternative anchor: '{alt_anchor}'\n\n"
                 result += f"Original: {alt_sentence_original}\n\n"
                 result += f"Suggested: {alt_output}"

                     if i < 0 or i+length > len(words):
                         continue
                     phrase = ' '.join(words[i:i+length])
+                    phrase_clean = phrase.strip('.,!?;:"\' ')
                     # Check if phrase is meaningful
                     if i < len(words) and i+length-1 < len(words):
             # Also extract single important words (proper nouns, long words)
             for word in words:
+                clean_word = word.strip('.,!?;:"\' ')
                 if clean_word and (len(clean_word) > 6 or
                     (len(clean_word) > 0 and clean_word[0].isupper() and clean_word.lower() not in stopwords)):
                     all_phrases.add(clean_word)
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
         return None, None
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
     try:
         print(f"DEBUG: Looking for anchor: '{anchor_text}'")
         print("="*50)
         # Check if keyword is present in the article
         full_text = " ".join(blocks)
         full_text_lower = full_text.lower()
         print(f"Keyword present in article: {keyword_present}")
+        # Target context for similarity matching
+        try:
+            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
+            tt = BeautifulSoup(tgt_html, "html.parser").title
+            tgt_title = tt.get_text().strip() if tt else ""
+        except Exception as e:
+            print(f"Error fetching target URL: {e}")
+            tgt_title = ""
         ext = tldextract.extract(target_url)
         tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
+        # Find best match with original anchor
+        query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
         try:
             q_emb = embed([query])[0]
                 rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
                 result = {
                     "anchor_was_present": anchor_found_in_sentence,
                     "best_sentence_original": best_sent,
                     "best_sentence_with_anchor": rewritten_sent,
+                    "keyword_in_article": keyword_present
                 }
                 # If anchor not present in article and alternative suggestion requested
                             result["alternative_exact_match"] = alt_exact
                     except Exception as e:
                         print(f"Error finding alternative anchor: {e}")
                         # Continue without alternative
                 results.append(result)
     # Don't check for exact anchor text match as it might have special chars
     return {"sentence_html": out}
+def gpt_validate_and_polish(sentence_html, anchor_text, target_url, language="English"):
     """
     Final QA pass with language support.
     """
     # Check if anchor was already present in the article
     anchor_was_present = res.get("anchor_was_present", False)
     keyword_in_article = res.get("keyword_in_article", False)
     # If anchor is present in the article (even if not in the best sentence)
     if keyword_in_article:
             # Anchor is in the suggested sentence - just show where to add the link
             final_output = to_plain_text(draft_html) if plain_text else draft_html
             result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
+            result += f"📍 Add link here:\n\n"
             result += f"{final_output}"
         else:
             # Anchor is in article but not in this sentence
             if smart_rewrite:
             final_output = to_plain_text(final_html) if plain_text else final_html
             result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
+            result += f"📍 Add link here:\n\n"
             result += f"{final_output}"
     else:
         # Anchor doesn't exist in article at all - need to add it
         final_output = to_plain_text(final_html) if plain_text else final_html
         result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
+        result += f"📍 Result 1 - Suggested placement:\n\n"
         result += f"Original: {original_sentence}\n\n"
         result += f"Suggested: {final_output}"
         # Show alternative if requested and available
         if suggest_alternative_anchor and res.get("alternative_anchor"):
                 # Add alternative as Result 2
                 result += f"\n\n{'='*50}\n\n"
+                result += f"📍 Result 2 - Alternative from article:\n"
                 result += f"💡 Alternative anchor: '{alt_anchor}'\n\n"
                 result += f"Original: {alt_sentence_original}\n\n"
                 result += f"Suggested: {alt_output}"