Spaces:

dusan-presswhizz
/

PressWhizz-Link-Insert-Suggestion-API

Sleeping

App Files Community

dusan-presswhizz committed on Aug 24, 2025

Commit

d6c6bec

verified ·

1 Parent(s): 40600a4

Update app.py

Browse files

Files changed (1) hide show

app.py +157 -106

app.py CHANGED Viewed

@@ -137,9 +137,8 @@ def is_likely_author_bio_or_footer(element, text):
     return False
 def get_text_blocks(url, max_paragraphs=7):
-    """Extract text blocks, prioritizing main content paragraphs."""
     try:
-        # Try with a more complete User-Agent that mimics a real browser
         headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
@@ -152,13 +151,12 @@ def get_text_blocks(url, max_paragraphs=7):
         resp = requests.get(url, timeout=20, headers=headers, allow_redirects=True)
         resp.raise_for_status()
-        # Ensure proper encoding
         resp.encoding = resp.apparent_encoding
         soup = BeautifulSoup(resp.text, "html.parser")
     except Exception as e:
         print(f"Error fetching URL {url}: {e}")
-        # Try one more time with just the basic User-Agent
         try:
             resp = requests.get(url, timeout=20, headers=UA)
             resp.raise_for_status()
             resp.encoding = resp.apparent_encoding
@@ -171,63 +169,88 @@ def get_text_blocks(url, max_paragraphs=7):
         tag.decompose()
     blocks = []
-    paragraph_count = 0
-    # Try to find main content area - add more potential content container names
-    main_content = (
-        soup.find('main') or
-        soup.find('article') or
-        soup.find('div', class_=re.compile('content|main|article|post|entry|blog', re.I)) or
-        soup.find('div', id=re.compile('content|main|article|post|entry', re.I)) or
-        soup.find('div', class_='container') or
-        soup.find('div', role='main')
-    )
     if not main_content:
         main_content = soup.body if soup.body else soup
-    # If still no paragraphs found, try a more aggressive approach
-    elements_to_check = main_content.find_all(["p","li","h2","h3","h4","blockquote","div"])
-    for el in elements_to_check:
-        # Skip if likely author bio or footer content
-        if is_likely_author_bio_or_footer(el, el.get_text()):
             continue
-        txt = " ".join(el.get_text(" ", strip=True).split())
-        # For divs, only include if they have substantial text and no nested block elements
-        if el.name == 'div':
-            # Skip divs that contain other block elements (they're containers)
-            if el.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section']):
-                continue
-            # Only include divs with substantial text content
-            if len(txt) < 100:
                 continue
-        if len(txt) > 60:
-            # Avoid duplicate content
-            if txt not in blocks:
-                blocks.append(txt)
-                if el.name in ['p', 'div']:  # Count paragraphs and text divs
-                    paragraph_count += 1
-                    if paragraph_count >= max_paragraphs:
-                        break
-    # If we still have no blocks, try to get ANY text from the page
-    if not blocks:
-        print(f"Warning: No standard blocks found, attempting fallback extraction for {url}")
-        # Get all text from body
-        if soup.body:
-            all_text = soup.body.get_text(separator="\n")
-            # Split by newlines and filter
-            lines = all_text.split('\n')
-            for line in lines:
-                line = " ".join(line.split()).strip()
-                if len(line) > 100:  # Only substantial lines
-                    blocks.append(line)
-                    if len(blocks) >= max_paragraphs:
-                        break
     return blocks
@@ -287,15 +310,6 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
     rewritten = f'{base}{clause}{punct}'
     return rewritten, False
-def create_anchor_suggestion(anchor_text, target_url):
-    """Create a suggestion for where to add the anchor when it's not found in the article."""
-    suggestions = [
-        f'Consider adding a new sentence like: "For more information on this topic, see <a href="{target_url}">{anchor_text}</a>."',
-        f'You could add: "Additional insights can be found in <a href="{target_url}">{anchor_text}</a>."',
-        f'Suggestion: "This relates to concepts discussed in <a href="{target_url}">{anchor_text}</a>."'
-    ]
-    return suggestions[0]
 def find_alternative_anchor(blocks, target_url, original_anchor):
     """Find a better anchor text from the article that relates to the target URL."""
     try:
@@ -433,21 +447,43 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
         if not blocks:
             return [{"error":"No text blocks found on the page."}]
-        # Check if keyword is present in the article - need to check case-insensitively
         full_text = " ".join(blocks)
         full_text_lower = full_text.lower()
         anchor_text_lower = anchor_text.lower() if anchor_text else ""
-        keyword_present = anchor_text_lower in full_text_lower
-        # Also check with normalized text (removing special chars)
         if not keyword_present:
-            # Try normalized search
-            import re
-            normalized_text = re.sub(r'[^a-z0-9\s]', '', full_text_lower)
-            normalized_anchor = re.sub(r'[^a-z0-9\s]', '', anchor_text_lower)
-            keyword_present = normalized_anchor in normalized_text
-        # target context
         try:
             tgt_html = requests.get(target_url, timeout=20, headers=UA).text
             tt = BeautifulSoup(tgt_html, "html.parser").title
@@ -459,8 +495,8 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
         ext = tldextract.extract(target_url)
         tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
-        # First, find best match with original anchor
-        query = f"{anchor_text} – relevant to: {tgt_title} ({tgt_domain})"
         try:
             q_emb = embed([query])[0]
@@ -475,7 +511,8 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
         results = []
         for idx in top_idx:
             try:
-                blk = blocks[min(idx, len(blocks)-1)]  # Ensure valid index
                 # Split sentences more carefully
                 sents = re.split(r'(?<=[.!?])\s+', blk)
@@ -493,7 +530,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
                     if len(sents) > 0 and all(len(s) > 0 for s in sents):
                         s_embs = embed(sents)
                         s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
-                        si = int(torch.argmax(s_sims).item())  # Use .item() to get scalar
                         if 0 <= si < len(sents):
                             best_sent = sents[si]
                 except Exception as e:
@@ -504,23 +541,23 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
                 if not best_sent or len(best_sent.strip()) == 0:
                     best_sent = blk if blk else "Unable to extract sentence from this section."
-                # Check if anchor is in THIS specific sentence (case-insensitive)
                 sentence_lower = best_sent.lower()
                 anchor_found_in_sentence = anchor_text_lower in sentence_lower
                 # If not found with exact match, try normalized
                 if not anchor_found_in_sentence:
-                    normalized_sent = re.sub(r'[^a-z0-9\s]', '', sentence_lower)
-                    normalized_anchor = re.sub(r'[^a-z0-9\s]', '', anchor_text_lower)
                     anchor_found_in_sentence = normalized_anchor in normalized_sent
                 rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
                 result = {
-                    "anchor_was_present": anchor_found_in_sentence,  # Use the sentence-specific check
                     "best_sentence_original": best_sent,
                     "best_sentence_with_anchor": rewritten_sent,
-                    "keyword_in_article": keyword_present  # This is for the whole article
                 }
                 # If anchor not present in article and alternative suggestion requested
@@ -544,6 +581,8 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
             except Exception as e:
                 print(f"Error processing block {idx}: {e}")
                 # Add a fallback result
                 results.append({
                     "anchor_was_present": False,
@@ -618,7 +657,7 @@ def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", languag
         "(2) Do NOT use an em dash or any dash. "
         '(3) Avoid phrases like "for details", "click here", "learn more", "visit", "read more". '
         "Prefer integrating the anchor as part of the sentence. "
-        f"(4) Write in {language} and preserve ALL special characters (č, ć, š, ž, đ, etc.). "
         "Return a compact JSON object with key sentence_html only."
     )
@@ -711,7 +750,7 @@ def to_plain_text(html_or_text):
     return html.unescape(text)
 # =========================
-# Gradio UI - FIXED VERSION
 # =========================
 def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor):
     if not source_url or not target_url or not anchor_text:
@@ -747,43 +786,55 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
     # Check if anchor was already present in the article
     anchor_was_present = res.get("anchor_was_present", False)
-    # Only apply GPT rewriting if anchor wasn't already present
-    # If anchor is present, we just want to show where to add the link
-    if anchor_was_present:
-        # Anchor exists - just show where to add the link
-        final_output = to_plain_text(draft_html) if plain_text else draft_html
-        result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
-        result += f"📍 Add link here:\n\n"
-        result += f"Original: {original_sentence}\n\n"
-        result += f"With link: {final_output}"
     else:
-        # Anchor doesn't exist - need to add it to the sentence
-        # 1) Optional first-pass rewrite with language support
         if smart_rewrite:
             g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
             final_html = g["sentence_html"]
         else:
             final_html = draft_html
-        # 2) QA/polish pass with language support
         polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
         final_html = polished.get("sentence_html", final_html)
-        # 3) Optionally convert to plain text
         final_output = to_plain_text(final_html) if plain_text else final_html
-        # Build the result for when anchor is NOT present
         result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
         result += f"📍 Result 1 - Suggested placement:\n\n"
         result += f"Original: {original_sentence}\n\n"
         result += f"Suggested: {final_output}"
-        # ONLY show alternative if:
-        # 1. suggest_alternative_anchor is True
-        # 2. The original anchor was NOT found in the article
-        # 3. We have an alternative suggestion
-        if suggest_alternative_anchor and not res.get("keyword_in_article", True) and res.get("alternative_anchor"):
             alt_anchor = res["alternative_anchor"]
             alt_sentence_original = res.get("alternative_sentence_original", "")
             alt_sentence = res.get("alternative_sentence", "")
@@ -844,7 +895,7 @@ with gr.Blocks(title=f"Link Insertion Helper • GPT: {gpt_status}") as demo:
                 plain_text = gr.Checkbox(label="Plain text (no URL)", value=True)
                 suggest_alternative_anchor = gr.Checkbox(
                     label="Suggest alternative anchor",
-                    value=True,  # ← CHANGED TO TRUE (DEFAULT CHECKED)
                     info="If anchor not found, suggest a better anchor from the article"
                 )
@@ -869,7 +920,7 @@ with gr.Blocks(title=f"Link Insertion Helper • GPT: {gpt_status}") as demo:
     gr.Markdown("""
     ### Features:
-    - 🌍 **Auto Language Detection**: Preserves special characters (č, ć, š, ž, đ, etc.)
     - 💾 **Smart Caching**: Caches embeddings and API responses for faster repeated queries
     - 🎯 **Main Content Focus**: Prioritizes first 5-7 paragraphs, ignores author bios
     - 🔄 **Alternative Anchor Suggestion**: When your anchor isn't in the article, suggests better anchors from existing text

     return False
 def get_text_blocks(url, max_paragraphs=7):
+    """Extract text blocks with improved extraction that captures all content."""
     try:
         headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
         resp = requests.get(url, timeout=20, headers=headers, allow_redirects=True)
         resp.raise_for_status()
         resp.encoding = resp.apparent_encoding
         soup = BeautifulSoup(resp.text, "html.parser")
     except Exception as e:
         print(f"Error fetching URL {url}: {e}")
         try:
+            # Fallback to simpler headers
             resp = requests.get(url, timeout=20, headers=UA)
             resp.raise_for_status()
             resp.encoding = resp.apparent_encoding
         tag.decompose()
     blocks = []
+    seen_texts = set()
+    # Try to find main content area - be more inclusive
+    main_content = None
+    content_selectors = [
+        ('main', {}),
+        ('article', {}),
+        ('div', {'class': re.compile('content|main|article|post|entry|blog|body|wrapper', re.I)}),
+        ('div', {'id': re.compile('content|main|article|post|entry|body|wrapper', re.I)}),
+        ('div', {'role': 'main'}),
+        ('div', {'class': 'container'}),
+    ]
+    for tag, attrs in content_selectors:
+        if attrs:
+            main_content = soup.find(tag, attrs)
+        else:
+            main_content = soup.find(tag)
+        if main_content:
+            break
     if not main_content:
         main_content = soup.body if soup.body else soup
+    # Method 1: Get ALL text from the main content area first
+    # This ensures we don't miss any content
+    full_text = main_content.get_text(separator="\n")
+    # Split by newlines and process
+    lines = full_text.split('\n')
+    temp_blocks = []
+    for line in lines:
+        clean_line = " ".join(line.strip().split())
+        if len(clean_line) > 60:  # Only keep substantial lines
+            if clean_line not in seen_texts:
+                temp_blocks.append(clean_line)
+                seen_texts.add(clean_line)
+    # Method 2: Also get specific HTML elements for better structure
+    for element in main_content.find_all(['p', 'div', 'li', 'h1', 'h2', 'h3', 'h4', 'blockquote'], recursive=True):
+        # Skip if likely author bio or footer
+        if is_likely_author_bio_or_footer(element, element.get_text()):
             continue
+        # For divs, skip if they contain other block elements (they're containers)
+        if element.name == 'div':
+            if element.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'article', 'section']):
                 continue
+        txt = " ".join(element.get_text(" ", strip=True).split())
+        # Add to blocks if substantial and not duplicate
+        if len(txt) > 60 and txt not in seen_texts:
+            blocks.append(txt)
+            seen_texts.add(txt)
+    # If we got blocks from method 2, use those (better structure)
+    # Otherwise, use the temp_blocks from method 1
+    if not blocks and temp_blocks:
+        blocks = temp_blocks[:max_paragraphs]
+    elif len(blocks) < max_paragraphs and temp_blocks:
+        # Combine both methods - add any unique blocks from temp_blocks
+        for tb in temp_blocks:
+            if tb not in seen_texts:
+                blocks.append(tb)
+                seen_texts.add(tb)
+                if len(blocks) >= max_paragraphs:
+                    break
+    # Limit to max_paragraphs
+    blocks = blocks[:max_paragraphs]
+    # Debug output
+    print(f"\nExtracted {len(blocks)} blocks from {url}")
+    if blocks:
+        print(f"First block preview: {blocks[0][:200]}...")
+        # Check if we have reasonable content
+        full_extracted = " ".join(blocks)
+        print(f"Total extracted text length: {len(full_extracted)} chars")
+    else:
+        print("WARNING: No blocks extracted!")
     return blocks
     rewritten = f'{base}{clause}{punct}'
     return rewritten, False
 def find_alternative_anchor(blocks, target_url, original_anchor):
     """Find a better anchor text from the article that relates to the target URL."""
     try:
         if not blocks:
             return [{"error":"No text blocks found on the page."}]
+        # DEBUG: Print what we extracted
+        print("\n" + "="*50)
+        print(f"DEBUG: Looking for anchor: '{anchor_text}'")
+        print("="*50)
+        # Check if keyword is present in the article
         full_text = " ".join(blocks)
         full_text_lower = full_text.lower()
         anchor_text_lower = anchor_text.lower() if anchor_text else ""
+        # Multiple ways to check for the anchor
+        keyword_present = False
+        # Method 1: Direct case-insensitive search
+        if anchor_text_lower in full_text_lower:
+            keyword_present = True
+            print(f"Found anchor via direct search")
+        # Method 2: Normalized search (remove extra spaces)
         if not keyword_present:
+            normalized_full = re.sub(r'\s+', ' ', full_text_lower)
+            normalized_anchor = re.sub(r'\s+', ' ', anchor_text_lower)
+            if normalized_anchor in normalized_full:
+                keyword_present = True
+                print(f"Found anchor via normalized search")
+        # Method 3: Check each block individually
+        if not keyword_present:
+            for i, block in enumerate(blocks):
+                if anchor_text_lower in block.lower():
+                    keyword_present = True
+                    print(f"Found anchor in block {i}: {block[:100]}...")
+                    break
+        print(f"Keyword present in article: {keyword_present}")
+        # Target context for similarity matching
         try:
             tgt_html = requests.get(target_url, timeout=20, headers=UA).text
             tt = BeautifulSoup(tgt_html, "html.parser").title
         ext = tldextract.extract(target_url)
         tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
+        # Find best match with original anchor
+        query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
         try:
             q_emb = embed([query])[0]
         results = []
         for idx in top_idx:
             try:
+                idx = min(idx, len(blocks)-1)  # Ensure valid index
+                blk = blocks[idx]
                 # Split sentences more carefully
                 sents = re.split(r'(?<=[.!?])\s+', blk)
                     if len(sents) > 0 and all(len(s) > 0 for s in sents):
                         s_embs = embed(sents)
                         s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
+                        si = int(torch.argmax(s_sims).item())
                         if 0 <= si < len(sents):
                             best_sent = sents[si]
                 except Exception as e:
                 if not best_sent or len(best_sent.strip()) == 0:
                     best_sent = blk if blk else "Unable to extract sentence from this section."
+                # Check if anchor is in THIS specific sentence
                 sentence_lower = best_sent.lower()
                 anchor_found_in_sentence = anchor_text_lower in sentence_lower
                 # If not found with exact match, try normalized
                 if not anchor_found_in_sentence:
+                    normalized_sent = re.sub(r'\s+', ' ', sentence_lower)
+                    normalized_anchor = re.sub(r'\s+', ' ', anchor_text_lower)
                     anchor_found_in_sentence = normalized_anchor in normalized_sent
                 rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
                 result = {
+                    "anchor_was_present": anchor_found_in_sentence,
                     "best_sentence_original": best_sent,
                     "best_sentence_with_anchor": rewritten_sent,
+                    "keyword_in_article": keyword_present
                 }
                 # If anchor not present in article and alternative suggestion requested
             except Exception as e:
                 print(f"Error processing block {idx}: {e}")
+                import traceback
+                traceback.print_exc()
                 # Add a fallback result
                 results.append({
                     "anchor_was_present": False,
         "(2) Do NOT use an em dash or any dash. "
         '(3) Avoid phrases like "for details", "click here", "learn more", "visit", "read more". '
         "Prefer integrating the anchor as part of the sentence. "
+        f"(4) Write in {language} and preserve ALL special characters (ć, č, š, ž, đ, etc.). "
         "Return a compact JSON object with key sentence_html only."
     )
     return html.unescape(text)
 # =========================
+# Gradio UI
 # =========================
 def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor):
     if not source_url or not target_url or not anchor_text:
     # Check if anchor was already present in the article
     anchor_was_present = res.get("anchor_was_present", False)
+    keyword_in_article = res.get("keyword_in_article", False)
+    # If anchor is present in the article (even if not in the best sentence)
+    if keyword_in_article:
+        # Anchor exists somewhere in article
+        if anchor_was_present:
+            # Anchor is in the suggested sentence - just show where to add the link
+            final_output = to_plain_text(draft_html) if plain_text else draft_html
+            result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
+            result += f"📍 Add link here:\n\n"
+            result += f"Original: {original_sentence}\n\n"
+            result += f"With link: {final_output}"
+        else:
+            # Anchor is in article but not in this sentence - show this sentence as an option
+            # and note that the anchor exists elsewhere
+            if smart_rewrite:
+                g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
+                final_html = g["sentence_html"]
+            else:
+                final_html = draft_html
+            polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
+            final_html = polished.get("sentence_html", final_html)
+            final_output = to_plain_text(final_html) if plain_text else final_html
+            result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
+            result += f"📍 The anchor appears elsewhere in the article. Here's a contextually relevant placement:\n\n"
+            result += f"Original: {original_sentence}\n\n"
+            result += f"Suggested: {final_output}\n\n"
+            result += f"💡 Note: You may want to search for '{anchor_text}' in the article to find where it naturally appears."
     else:
+        # Anchor doesn't exist in article at all - need to add it
         if smart_rewrite:
             g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
             final_html = g["sentence_html"]
         else:
             final_html = draft_html
         polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
         final_html = polished.get("sentence_html", final_html)
         final_output = to_plain_text(final_html) if plain_text else final_html
         result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
         result += f"📍 Result 1 - Suggested placement:\n\n"
         result += f"Original: {original_sentence}\n\n"
         result += f"Suggested: {final_output}"
+        # Show alternative if requested and available
+        if suggest_alternative_anchor and res.get("alternative_anchor"):
             alt_anchor = res["alternative_anchor"]
             alt_sentence_original = res.get("alternative_sentence_original", "")
             alt_sentence = res.get("alternative_sentence", "")
                 plain_text = gr.Checkbox(label="Plain text (no URL)", value=True)
                 suggest_alternative_anchor = gr.Checkbox(
                     label="Suggest alternative anchor",
+                    value=True,
                     info="If anchor not found, suggest a better anchor from the article"
                 )
     gr.Markdown("""
     ### Features:
+    - 🌍 **Auto Language Detection**: Preserves special characters (ć, č, š, ž, đ, etc.)
     - 💾 **Smart Caching**: Caches embeddings and API responses for faster repeated queries
     - 🎯 **Main Content Focus**: Prioritizes first 5-7 paragraphs, ignores author bios
     - 🔄 **Alternative Anchor Suggestion**: When your anchor isn't in the article, suggests better anchors from existing text