Spaces:

dusan-presswhizz
/

PressWhizz-Link-Insert-Suggestion-API

Build error

App Files Files Community

dusan-presswhizz committed on Aug 24, 2025

Commit

b9459ca

verified ·

1 Parent(s): d3ac3e5

Update app.py

Browse files

Files changed (1) hide show

app.py +188 -50

app.py CHANGED Viewed

@@ -261,35 +261,13 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
     rewritten = f'{base}{clause}{punct}'
     return rewritten, False
-def find_alternative_anchor(blocks, target_url, original_anchor):
     """Find a better anchor text from the article that relates to the target URL."""
     try:
-        # Get target page context
-        try:
-            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
-            soup = BeautifulSoup(tgt_html, "html.parser")
-            # Extract target page title and meta description
-            title = soup.title.get_text().strip() if soup.title else ""
-            meta_desc = ""
-            meta_tag = soup.find("meta", attrs={"name": "description"})
-            if meta_tag:
-                meta_desc = meta_tag.get("content", "")
-            # Extract key terms from target page (first few paragraphs)
-            target_paragraphs = []
-            for p in soup.find_all("p")[:5]:
-                text = p.get_text().strip()
-                if len(text) > 50:
-                    target_paragraphs.append(text)
-            target_content = " ".join(target_paragraphs[:3])
-        except Exception as e:
-            print(f"Error fetching target URL: {e}")
-            title = ""
-            meta_desc = ""
-            target_content = original_anchor
         # Extract all potential anchor phrases from the source article
         all_phrases = set()
         full_text = " ".join(blocks)
@@ -314,7 +292,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
                     if i < 0 or i+length > len(words):
                         continue
                     phrase = ' '.join(words[i:i+length])
-                    phrase_clean = phrase.strip('.,!?;:"\' ')
                     # Check if phrase is meaningful
                     if i < len(words) and i+length-1 < len(words):
@@ -330,7 +308,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
             # Also extract single important words (proper nouns, long words)
             for word in words:
-                clean_word = word.strip('.,!?;:"\' ')
                 if clean_word and (len(clean_word) > 6 or
                     (len(clean_word) > 0 and clean_word[0].isupper() and clean_word.lower() not in stopwords)):
                     all_phrases.add(clean_word)
@@ -338,12 +316,12 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
         if not all_phrases:
             return None, None
-        # Create context query from target URL info
-        target_context = f"{title} {meta_desc} {target_content}"[:500]
         # Score each phrase based on relevance to target
         try:
-            target_emb = embed([target_context])[0]
         except:
             return None, None
@@ -362,6 +340,10 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
                 phrase_emb = embed([phrase])[0]
                 relevance_score = F.cosine_similarity(phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
                 # Check if this phrase appears in article and find its best context
                 if phrase.lower() in full_text.lower():
                     # Find sentences containing this phrase
@@ -376,22 +358,146 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
                                         context_score = F.cosine_similarity(sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
                                         combined_score = (relevance_score * 0.6) + (context_score * 0.4)
-                                        if combined_score > best_score:
                                             best_score = combined_score
                                             best_anchor = phrase
                                             best_sentence = sent
                                     except:
                                         continue
             except Exception as e:
                 print(f"Error evaluating phrase '{phrase}': {e}")
                 continue
         return best_anchor, best_sentence
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
         return None, None
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
     try:
         blocks = get_text_blocks(source_url)
@@ -403,6 +509,21 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
         print(f"DEBUG: Looking for anchor: '{anchor_text}'")
         print("="*50)
         # Check if keyword is present in the article
         full_text = " ".join(blocks)
         full_text_lower = full_text.lower()
@@ -434,20 +555,11 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
         print(f"Keyword present in article: {keyword_present}")
-        # Target context for similarity matching
-        try:
-            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
-            tt = BeautifulSoup(tgt_html, "html.parser").title
-            tgt_title = tt.get_text().strip() if tt else ""
-        except Exception as e:
-            print(f"Error fetching target URL: {e}")
-            tgt_title = ""
         ext = tldextract.extract(target_url)
         tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
-        # Find best match with original anchor
-        query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
         try:
             q_emb = embed([query])[0]
@@ -504,18 +616,31 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
                 rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
                 result = {
                     "anchor_was_present": anchor_found_in_sentence,
                     "best_sentence_original": best_sent,
                     "best_sentence_with_anchor": rewritten_sent,
-                    "keyword_in_article": keyword_present
                 }
                 # If anchor not present in article and alternative suggestion requested
                 if suggest_alternative and not keyword_present:
                     try:
                         # Find a completely different anchor and sentence
-                        alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)
                         if alt_anchor and alt_sentence:
                             # Create the sentence with the alternative anchor
@@ -738,6 +863,17 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
     # Check if anchor was already present in the article
     anchor_was_present = res.get("anchor_was_present", False)
     keyword_in_article = res.get("keyword_in_article", False)
     # If anchor is present in the article (even if not in the best sentence)
     if keyword_in_article:
@@ -746,8 +882,10 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
             # Anchor is in the suggested sentence - just show where to add the link
             final_output = to_plain_text(draft_html) if plain_text else draft_html
             result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
-            result += f"📍 Add link here:\n\n"
             result += f"{final_output}"
         else:
             # Anchor is in article but not in this sentence
             if smart_rewrite:
@@ -761,7 +899,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
             final_output = to_plain_text(final_html) if plain_text else final_html
             result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
-            result += f"📍 Add link here:\n\n"
             result += f"{final_output}"
     else:
         # Anchor doesn't exist in article at all - need to add it
@@ -776,7 +914,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
         final_output = to_plain_text(final_html) if plain_text else final_html
         result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
-        result += f"📍 Result 1 - Suggested placement:\n\n"
         result += f"Original: {original_sentence}\n\n"
         result += f"Suggested: {final_output}"
@@ -807,7 +945,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
                 # Add alternative as Result 2
                 result += f"\n\n{'='*50}\n\n"
-                result += f"📍 Result 2 - Alternative from article:\n"
                 result += f"💡 Alternative anchor: '{alt_anchor}'\n\n"
                 result += f"Original: {alt_sentence_original}\n\n"
                 result += f"Suggested: {alt_output}"

     rewritten = f'{base}{clause}{punct}'
     return rewritten, False
+def find_alternative_anchor(blocks, target_url, original_anchor, target_context=None):
     """Find a better anchor text from the article that relates to the target URL."""
     try:
+        # Use provided target context or analyze the URL
+        if not target_context:
+            target_context = analyze_target_url(target_url)
         # Extract all potential anchor phrases from the source article
         all_phrases = set()
         full_text = " ".join(blocks)
                     if i < 0 or i+length > len(words):
                         continue
                     phrase = ' '.join(words[i:i+length])
+                    phrase_clean = phrase.strip('.,!?;:"\'')
                     # Check if phrase is meaningful
                     if i < len(words) and i+length-1 < len(words):
             # Also extract single important words (proper nouns, long words)
             for word in words:
+                clean_word = word.strip('.,!?;:"\'')
                 if clean_word and (len(clean_word) > 6 or
                     (len(clean_word) > 0 and clean_word[0].isupper() and clean_word.lower() not in stopwords)):
                     all_phrases.add(clean_word)
         if not all_phrases:
             return None, None
+        # Use the comprehensive target context
+        target_summary = target_context.get("summary", "")[:500]
         # Score each phrase based on relevance to target
         try:
+            target_emb = embed([target_summary])[0]
         except:
             return None, None
                 phrase_emb = embed([phrase])[0]
                 relevance_score = F.cosine_similarity(phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
+                # Only consider phrases with good relevance to target (threshold)
+                if relevance_score < 0.3:  # Skip low relevance phrases
+                    continue
                 # Check if this phrase appears in article and find its best context
                 if phrase.lower() in full_text.lower():
                     # Find sentences containing this phrase
                                         context_score = F.cosine_similarity(sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
                                         combined_score = (relevance_score * 0.6) + (context_score * 0.4)
+                                        # Higher threshold for alternative anchors
+                                        if combined_score > best_score and combined_score > 0.35:
                                             best_score = combined_score
                                             best_anchor = phrase
                                             best_sentence = sent
+                                            print(f"  Found alternative: '{phrase}' (score: {combined_score:.3f})")
                                     except:
                                         continue
             except Exception as e:
                 print(f"Error evaluating phrase '{phrase}': {e}")
                 continue
+        if best_anchor:
+            print(f"\n✓ Best alternative anchor: '{best_anchor}' (relevance: {best_score:.3f})")
+        else:
+            print(f"\n✗ No suitable alternative anchor found with sufficient relevance to target page")
         return best_anchor, best_sentence
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
         return None, None
def analyze_target_url(target_url, fallback_summary=""):
    """Fetch the target page and summarise what it is about.

    The main text is extracted with Trafilatura; if that yields nothing, the
    page is fetched with ``requests`` and the visible text is taken from the
    HTML with boilerplate tags removed. Title, meta description and the first
    h1-h3 headings are then read from the page HTML.

    Args:
        target_url: URL of the page the link will point to.
        fallback_summary: Text used as the ``"summary"`` value when the page
            cannot be analysed (for example, the anchor text). Defaults to an
            empty string.

    Returns:
        A dict with the keys ``"title"``, ``"meta_description"``,
        ``"headings"``, ``"main_content"``, ``"full_text"`` and ``"summary"``.
        This function never raises: on any failure the same keys are returned
        with empty values and ``"summary"`` set to ``fallback_summary``.
    """
    try:
        # Try Trafilatura first for better extraction
        downloaded = trafilatura.fetch_url(target_url)
        target_text = None
        if downloaded:
            target_text = trafilatura.extract(
                downloaded,
                include_comments=False,
                include_tables=False,
                deduplicate=True,
                output_format='txt',
                favor_precision=False,
            )

        tgt_html = None
        if not target_text:
            # Fallback to BeautifulSoup on a freshly fetched copy of the page
            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
            text_soup = BeautifulSoup(tgt_html, "html.parser")
            # Remove unwanted elements before collecting the visible text
            for tag in text_soup(["script", "style", "noscript", "header",
                                  "footer", "nav", "aside", "form"]):
                tag.decompose()
            target_text = (
                text_soup.get_text(separator=" ", strip=True)
                if text_soup.body else ""
            )

        # Reuse whichever HTML is already in hand for the metadata pass.
        # (Previously this read an unassigned variable when Trafilatura
        # succeeded, and fetched the page a second time when it did not.)
        if tgt_html is None:
            tgt_html = downloaded

        # Parse an unmodified copy so headings inside <header> are retained
        soup = BeautifulSoup(tgt_html, "html.parser")

        # Extract title
        title = soup.title.get_text().strip() if soup.title else ""

        # Extract meta description
        meta_desc = ""
        meta_tag = soup.find("meta", attrs={"name": "description"})
        if meta_tag:
            meta_desc = meta_tag.get("content", "")

        # Extract h1-h3 headings for topic understanding
        headings = []
        for h in soup.find_all(['h1', 'h2', 'h3'])[:10]:
            heading_text = h.get_text().strip()
            if heading_text:
                headings.append(heading_text)

        # Take first 1500 chars of main content for context
        main_content = target_text[:1500] if target_text else ""

        target_context = {
            "title": title,
            "meta_description": meta_desc,
            "headings": headings,
            "main_content": main_content,
            "full_text": target_text[:3000] if target_text else "",  # Limit for embedding
            "summary": f"{title} {meta_desc} {' '.join(headings[:5])} {main_content[:500]}"
        }

        print(f"\nTarget URL Analysis:")
        print(f"  Title: {title[:100]}")
        print(f"  Meta: {meta_desc[:100]}")
        print(f"  Main headings: {headings[:3]}")
        print(f"  Content preview: {main_content[:200]}...")
        return target_context
    except Exception as e:
        # Best-effort boundary: analysis failures must not abort the caller.
        print(f"Error analyzing target URL: {e}")
        return {
            "title": "",
            "meta_description": "",
            "headings": [],
            "main_content": "",
            "full_text": "",
            # The anchor text is not in scope here; the caller supplies it
            # through fallback_summary if it wants that behaviour.
            "summary": fallback_summary
        }
def validate_anchor_relevance(anchor_text, sentence, target_context, threshold=0.3):
    """Check whether the anchor and sentence are relevant to the target page.

    The anchor text and the sentence are each embedded and compared, by
    cosine similarity, against the embedding of ``target_context["summary"]``.

    Args:
        anchor_text: Anchor text proposed for the link.
        sentence: Sentence in which the anchor would be placed.
        target_context: Dict whose ``"summary"`` value describes the target page.
        threshold: Minimum cosine similarity counted as relevant.

    Returns:
        A tuple ``(is_relevant, anchor_relevance, sentence_relevance)``.
        ``is_relevant`` is True when either score exceeds ``threshold``.
        When there is no summary to compare against, or an error occurs, the
        permissive default ``(True, 0.5, 0.5)`` is returned.
    """
    try:
        target_summary = target_context.get("summary", "")
        if not target_summary:
            # If we can't analyze, assume it's ok. Return the same 3-tuple
            # shape as every other path, because callers unpack three values.
            return True, 0.5, 0.5

        # Embed target content
        target_emb = embed([target_summary])[0]

        # Check anchor relevance to target
        anchor_emb = embed([anchor_text])[0]
        anchor_relevance = F.cosine_similarity(
            anchor_emb.unsqueeze(0),
            target_emb.unsqueeze(0)
        ).item()

        # Check sentence relevance to target
        sentence_emb = embed([sentence])[0]
        sentence_relevance = F.cosine_similarity(
            sentence_emb.unsqueeze(0),
            target_emb.unsqueeze(0)
        ).item()

        print(f"\nRelevance scores:")
        print(f"  Anchor '{anchor_text}' to target: {anchor_relevance:.3f}")
        print(f"  Sentence to target: {sentence_relevance:.3f}")

        # Relevant if either the anchor or the sentence is relevant enough
        is_relevant = anchor_relevance > threshold or sentence_relevance > threshold
        if not is_relevant:
            print(f"  ⚠️ Low relevance detected! Anchor/sentence may not match target page topic.")
        return is_relevant, anchor_relevance, sentence_relevance
    except Exception as e:
        print(f"Error validating relevance: {e}")
        return True, 0.5, 0.5  # Default to allowing if error
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
     try:
         blocks = get_text_blocks(source_url)
         print(f"DEBUG: Looking for anchor: '{anchor_text}'")
         print("="*50)
+        # ANALYZE TARGET URL FIRST - This is the key addition
+        target_context = analyze_target_url(target_url)
+        # Validate that the anchor text is relevant to the target page
+        is_relevant, anchor_score, _ = validate_anchor_relevance(
+            anchor_text,
+            anchor_text,  # Check anchor against itself first
+            target_context,
+            threshold=0.25  # Lower threshold for initial check
+        )
+        if not is_relevant and anchor_score < 0.2:
+            print(f"\n⚠️ WARNING: Anchor '{anchor_text}' seems unrelated to target page content!")
+            print(f"Target page appears to be about: {target_context['title'][:100]}")
         # Check if keyword is present in the article
         full_text = " ".join(blocks)
         full_text_lower = full_text.lower()
         print(f"Keyword present in article: {keyword_present}")
         ext = tldextract.extract(target_url)
         tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
+        # Use the comprehensive target context for finding best match
+        query = f"{anchor_text} — relevant to: {target_context['summary'][:200]}"
         try:
             q_emb = embed([query])[0]
                 rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
+                # Validate the sentence relevance to target before including it
+                is_relevant, _, sent_relevance = validate_anchor_relevance(
+                    anchor_text,
+                    best_sent,
+                    target_context,
+                    threshold=0.25
+                )
                 result = {
                     "anchor_was_present": anchor_found_in_sentence,
                     "best_sentence_original": best_sent,
                     "best_sentence_with_anchor": rewritten_sent,
+                    "keyword_in_article": keyword_present,
+                    "relevance_score": sent_relevance,
+                    "is_relevant": is_relevant,
+                    "target_title": target_context.get("title", ""),
+                    "target_topic": target_context.get("meta_description", "")[:100]
                 }
                 # If anchor not present in article and alternative suggestion requested
                 if suggest_alternative and not keyword_present:
                     try:
                         # Find a completely different anchor and sentence
+                        # Pass the target_context we already analyzed
+                        alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text, target_context)
                         if alt_anchor and alt_sentence:
                             # Create the sentence with the alternative anchor
     # Check if anchor was already present in the article
     anchor_was_present = res.get("anchor_was_present", False)
     keyword_in_article = res.get("keyword_in_article", False)
+    relevance_score = res.get("relevance_score", 0)
+    is_relevant = res.get("is_relevant", True)
+    target_title = res.get("target_title", "")
+    target_topic = res.get("target_topic", "")
+    # Add warning if low relevance detected
+    relevance_warning = ""
+    if not is_relevant or relevance_score < 0.25:
+        relevance_warning = f"\n\n⚠️ **Warning**: The suggested content may not be highly relevant to the target page.\n"
+        relevance_warning += f"Target page appears to be about: {target_title[:100]}\n"
+        relevance_warning += f"Relevance score: {relevance_score:.2f}\n"
     # If anchor is present in the article (even if not in the best sentence)
     if keyword_in_article:
             # Anchor is in the suggested sentence - just show where to add the link
             final_output = to_plain_text(draft_html) if plain_text else draft_html
             result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
+            result += f"🔗 Add link here:\n\n"
             result += f"{final_output}"
+            result += relevance_warning
+            result += relevance_warning
         else:
             # Anchor is in article but not in this sentence
             if smart_rewrite:
             final_output = to_plain_text(final_html) if plain_text else final_html
             result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
+            result += f"🔗 Add link here:\n\n"
             result += f"{final_output}"
     else:
         # Anchor doesn't exist in article at all - need to add it
         final_output = to_plain_text(final_html) if plain_text else final_html
         result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
+        result += f"🔗 Result 1 - Suggested placement:\n\n"
         result += f"Original: {original_sentence}\n\n"
         result += f"Suggested: {final_output}"
                 # Add alternative as Result 2
                 result += f"\n\n{'='*50}\n\n"
+                result += f"🔗 Result 2 - Alternative from article:\n"
                 result += f"💡 Alternative anchor: '{alt_anchor}'\n\n"
                 result += f"Original: {alt_sentence_original}\n\n"
                 result += f"Suggested: {alt_output}"