dusan-presswhizz committed on
Commit
74325d3
·
verified ·
1 Parent(s): bb074b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -149
app.py CHANGED
@@ -262,134 +262,63 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
262
  return rewritten, False
263
 
264
def find_alternative_anchor(blocks, target_url, original_anchor):
    """Find a better anchor text from the article that relates to the target URL.

    Fetches the target page to build a relevance context (title, meta
    description, lead paragraphs), extracts candidate 1-4 word phrases from
    the article ``blocks``, and scores each candidate (and the sentence it
    appears in) against the target context via embedding cosine similarity.

    Args:
        blocks: List of article text blocks (paragraphs) to mine for anchors.
        target_url: URL of the page the anchor should link to.
        original_anchor: Anchor the user asked for; candidates equal to it
            (case-insensitively) are skipped, and it is used as a fallback
            context when the target page cannot be fetched.

    Returns:
        Tuple ``(best_anchor, best_sentence)``, or ``(None, None)`` when no
        candidate is found or the embedding step fails.
    """
    try:
        # --- Target page context (best effort; fall back to the original anchor) ---
        try:
            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
            soup = BeautifulSoup(tgt_html, "html.parser")

            # Title + meta description summarize what the target is about.
            title = soup.title.get_text().strip() if soup.title else ""
            meta_desc = ""
            meta_tag = soup.find("meta", attrs={"name": "description"})
            if meta_tag:
                meta_desc = meta_tag.get("content", "")

            # Lead paragraphs (>50 chars) add topical context for scoring.
            target_paragraphs = []
            for p in soup.find_all("p")[:5]:
                text = p.get_text().strip()
                if len(text) > 50:
                    target_paragraphs.append(text)
            target_content = " ".join(target_paragraphs[:3])

        except Exception as e:
            print(f"Error fetching target URL: {e}")
            title = ""
            meta_desc = ""
            target_content = original_anchor

        # --- Candidate phrases from the source article ---
        all_phrases = set()
        full_text = " ".join(blocks)

        # Common words to exclude (English + Croatian/Serbian function words).
        stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                     'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
                     'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                     'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
                     'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što'}

        sentences = re.split(r'[.!?]', full_text)
        for sentence in sentences:
            if not sentence:
                continue
            words = sentence.split()

            # Phrases of 2-4 words; skip those that start/end on a stopword.
            # (FIX: removed always-false/always-true bounds checks the loop
            # ranges already guarantee.)
            for length in range(2, min(5, len(words) + 1)):
                for i in range(len(words) - length + 1):
                    phrase = ' '.join(words[i:i + length])
                    phrase_clean = phrase.strip('.,!?;:"\' ')

                    first_word = words[i].lower().strip('.,!?;:')
                    last_word = words[i + length - 1].lower().strip('.,!?;:')
                    if (first_word not in stopwords and
                            last_word not in stopwords and
                            5 < len(phrase_clean) < 50):
                        all_phrases.add(phrase_clean)

            # Single important words: long words, or capitalized non-stopwords
            # (likely proper nouns).
            for word in words:
                clean_word = word.strip('.,!?;:"\' ')
                if clean_word and (len(clean_word) > 6 or
                                   (clean_word[0].isupper() and
                                    clean_word.lower() not in stopwords)):
                    all_phrases.add(clean_word)

        if not all_phrases:
            return None, None

        # --- Score candidates against the target context ---
        target_context = f"{title} {meta_desc} {target_content}"[:500]
        try:
            target_emb = embed([target_context])[0]
        except Exception:  # FIX: was a bare except (swallowed SystemExit/KeyboardInterrupt)
            return None, None

        best_anchor = None
        best_score = -1
        best_sentence = None

        # Cap at 50 candidates to bound embedding work.
        for phrase in list(all_phrases)[:50]:
            # Skip the original anchor -- we want something different.
            if phrase.lower() == original_anchor.lower():
                continue

            try:
                phrase_emb = embed([phrase])[0]
                relevance_score = F.cosine_similarity(
                    phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()

                # Only phrases actually present in the article can be anchors;
                # score each containing sentence to find the best placement.
                if phrase.lower() in full_text.lower():
                    for block in blocks:
                        if phrase.lower() not in block.lower():
                            continue
                        sents = re.split(r'(?<=[.!?])\s+', block)
                        for sent in sents:
                            if sent and phrase.lower() in sent.lower():
                                try:
                                    sent_emb = embed([sent])[0]
                                    context_score = F.cosine_similarity(
                                        sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
                                    # Weight phrase relevance over sentence context.
                                    combined_score = (relevance_score * 0.6) + (context_score * 0.4)

                                    if combined_score > best_score:
                                        best_score = combined_score
                                        best_anchor = phrase
                                        best_sentence = sent
                                except Exception:  # FIX: was a bare except
                                    continue
            except Exception as e:
                print(f"Error evaluating phrase '{phrase}': {e}")
                continue

        return best_anchor, best_sentence

    except Exception as e:
        print(f"Critical error in find_alternative_anchor: {e}")
        return None, None
394
 
395
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
@@ -514,18 +443,16 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
514
  # If anchor not present in article and alternative suggestion requested
515
  if suggest_alternative and not keyword_present:
516
  try:
517
- # Find a completely different anchor and sentence
518
- alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)
519
 
520
- if alt_anchor and alt_sentence:
521
- # Create the sentence with the alternative anchor
522
- alt_rewritten, alt_exact = inject_anchor_into_sentence(alt_sentence, alt_anchor, target_url)
523
  result["alternative_anchor"] = alt_anchor
524
- result["alternative_sentence_original"] = alt_sentence
525
- result["alternative_sentence"] = alt_rewritten
526
- result["alternative_exact_match"] = alt_exact
527
  except Exception as e:
528
- print(f"Error finding alternative anchor: {e}")
529
  # Continue without alternative
530
 
531
  results.append(result)
@@ -694,6 +621,97 @@ def gpt_validate_and_polish(sentence_html, anchor_text, target_url, language="En
694
 
695
  return {"sentence_html": out}
696
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  def to_plain_text(html_or_text):
698
  """Convert HTML to plain text, properly handling special characters."""
699
  text = BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
@@ -746,7 +764,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
746
  # Anchor is in the suggested sentence - just show where to add the link
747
  final_output = to_plain_text(draft_html) if plain_text else draft_html
748
  result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
749
- result += f"πŸ“ Add link here:\n\n"
750
  result += f"{final_output}"
751
  else:
752
  # Anchor is in article but not in this sentence
@@ -761,7 +779,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
761
  final_output = to_plain_text(final_html) if plain_text else final_html
762
 
763
  result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
764
- result += f"πŸ“ Add link here:\n\n"
765
  result += f"{final_output}"
766
  else:
767
  # Anchor doesn't exist in article at all - need to add it
@@ -776,41 +794,35 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
776
  final_output = to_plain_text(final_html) if plain_text else final_html
777
 
778
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
779
- result += f"πŸ“ Result 1 - Suggested placement:\n\n"
780
  result += f"Original: {original_sentence}\n\n"
781
  result += f"Suggested: {final_output}"
782
 
783
  # Show alternative if requested and available
784
  if suggest_alternative_anchor and res.get("alternative_anchor"):
785
  alt_anchor = res["alternative_anchor"]
786
- alt_sentence_original = res.get("alternative_sentence_original", "")
787
- alt_sentence = res.get("alternative_sentence", "")
788
 
789
- # Detect language for alternative sentence
790
- if alt_sentence_original:
791
- alt_detected_lang = detect_language(alt_sentence_original)
792
- alt_language_name = get_language_name(alt_detected_lang)
793
-
794
- # Apply GPT rewriting to alternative as well
795
- if smart_rewrite and alt_sentence:
796
- alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=alt_language_name)
797
- alt_final = alt_g["sentence_html"]
798
  else:
799
- alt_final = alt_sentence
800
-
801
- # Polish if needed
802
- if not res.get("alternative_exact_match", False):
803
- alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=alt_language_name)
804
- alt_final = alt_polished.get("sentence_html", alt_final)
805
 
806
- alt_output = to_plain_text(alt_final) if plain_text else alt_final
 
807
 
808
  # Add alternative as Result 2
809
  result += f"\n\n{'='*50}\n\n"
810
- result += f"πŸ“ Result 2 - Alternative from article:\n"
811
- result += f"πŸ’‘ Alternative anchor: '{alt_anchor}'\n\n"
812
- result += f"Original: {alt_sentence_original}\n\n"
813
- result += f"Suggested: {alt_output}"
 
814
 
815
  return result
816
 
 
262
  return rewritten, False
263
 
264
def find_alternative_anchor(blocks, target_url, original_anchor):
    """
    NEW VERSION: Generate new content with keywords from target page.

    Pipeline: extract the target page's text, ask GPT for search keywords
    people would use to find it, detect the source article's language, then
    ask GPT to write a short linked addition using the best-fitting keyword.

    Args:
        blocks: Source article paragraphs (used for language detection and
            as context for content generation).
        target_url: URL the generated content should link to.
        original_anchor: User-supplied anchor; last-resort keyword fallback.

    Returns:
        Tuple ``(chosen_keyword, content)`` where ``content`` is
        ``"[Insert after paragraph N]: reasoning\\n\\n<html content>"``,
        or ``(None, None)`` on any failure.
    """
    try:
        print(f"[Alternative] Extracting target page content from {target_url}")

        # Step 1: Extract target page content using Trafilatura
        target_blocks = get_text_blocks(target_url, max_paragraphs=5)
        if not target_blocks:
            print("[Alternative] No content extracted from target page")
            return None, None

        print(f"[Alternative] Extracted {len(target_blocks)} blocks from target")

        # Step 2: Get search keywords from target content
        keywords = gpt_get_search_keywords(target_blocks, target_url)
        print(f"[Alternative] Keywords identified: {keywords}")

        if not keywords or not isinstance(keywords, list):
            print("[Alternative] No valid keywords returned")
            return None, None

        # Step 3: Detect language from source article
        source_text = " ".join(blocks[:2])  # Use first 2 paragraphs for detection
        detected_lang = detect_language(source_text)
        language_name = get_language_name(detected_lang)
        print(f"[Alternative] Detected language: {language_name}")

        # Step 4: Generate new content with keyword
        result = gpt_generate_content_with_keyword(
            source_blocks=blocks,
            keywords=keywords,
            target_url=target_url,
            language=language_name
        )

        if not result:
            print("[Alternative] Content generation failed")
            return None, None

        # Return in format compatible with existing code
        chosen_keyword = result.get("chosen_keyword") or (keywords[0] if keywords else original_anchor)
        new_content = result.get("new_content", "")
        # FIX: the model may return the paragraph index as a string or junk;
        # coerce to int so the "+ 1" below cannot raise TypeError.
        try:
            insert_after = int(result.get("insert_after_paragraph", 0))
        except (TypeError, ValueError):
            insert_after = 0
        reasoning = result.get("reasoning", "")

        # Format the response for compatibility:
        # return (anchor_text, formatted_content_with_position).
        position_text = f"[Insert after paragraph {insert_after + 1}]: {reasoning}"

        return chosen_keyword, f"{position_text}\n\n{new_content}"

    except Exception as e:
        print(f"[Alternative] Critical error: {e}")
        import traceback
        traceback.print_exc()
        return None, None
323
 
324
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
 
443
  # If anchor not present in article and alternative suggestion requested
444
  if suggest_alternative and not keyword_present:
445
  try:
446
+ # Generate new content with keywords from target page
447
+ alt_anchor, alt_content = find_alternative_anchor(blocks, target_url, anchor_text)
448
 
449
+ if alt_anchor and alt_content:
 
 
450
  result["alternative_anchor"] = alt_anchor
451
+ result["alternative_sentence_original"] = "" # No original since it's new content
452
+ result["alternative_sentence"] = alt_content
453
+ result["alternative_exact_match"] = True # It's generated with the link
454
  except Exception as e:
455
+ print(f"Error generating alternative content: {e}")
456
  # Continue without alternative
457
 
458
  results.append(result)
 
621
 
622
  return {"sentence_html": out}
623
 
624
def gpt_get_search_keywords(target_content, target_url):
    """
    Analyze target page content and get search keywords people would use.

    Args:
        target_content: Target page text, either a list of paragraph strings
            or a single string.
        target_url: URL of the target page (sent to the model as context and
            mixed into the cache key).

    Returns:
        A non-empty list of keyword strings; falls back to generic phrases
        when no API key is configured or the call fails.
    """
    fallback = ["related content", "learn more", "additional information"]
    if not OPENAI_API_KEY:
        return fallback

    # Normalize content first (limit to avoid token limits) so the prompt
    # and the cache key agree.
    # FIX: the cache key previously hashed the raw value, which for list
    # input was the repr of a list slice rather than the text sent to GPT.
    content_preview = " ".join(target_content[:5]) if isinstance(target_content, list) else target_content[:3000]

    # Cache key: stable per (url, content prefix).
    cache_key = hashlib.md5(f"keywords_{target_url}_{content_preview[:500]}".encode()).hexdigest()

    system = (
        "You are an SEO expert. Analyze the provided web page content and identify "
        "5-10 search keywords or phrases that people would typically use to find this page. "
        "Focus on practical, real search terms that users would type into Google. "
        "Return a JSON object with a 'keywords' array containing 5-10 keyword phrases."
    )

    user = {
        "task": "identify_search_keywords",
        "page_content": content_preview,
        "url": target_url,
        "requirements": {
            "count": "5-10 keywords",
            "type": "practical search terms",
            "focus": "what users would actually search for"
        }
    }

    try:
        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    except Exception as e:
        print(f"[GPT] Keywords extraction failed: {e}")
        return fallback

    keywords = obj.get("keywords", ["related content"])
    # FIX: guard against malformed model output (non-list, empty, or
    # non-string items) so callers always get a list of strings.
    if not isinstance(keywords, list):
        return ["related content"]
    cleaned = [str(k).strip() for k in keywords if k and str(k).strip()]
    return cleaned or ["related content"]
662
+
663
def gpt_generate_content_with_keyword(source_blocks, keywords, target_url, language="English"):
    """
    Generate new content with the best keyword and specify where to insert it.

    Args:
        source_blocks: Article paragraphs giving the model context (first 7
            are sent).
        keywords: Candidate keyword phrases; the model picks the best fit.
        target_url: URL the chosen keyword should be linked to.
        language: Language name the new content must be written in.

    Returns:
        The parsed JSON dict from the model, expected to contain
        'chosen_keyword', 'new_content' (HTML with an <a href> link),
        'insert_after_paragraph' (0-based), and 'reasoning'; or None when
        no API key / no keywords / both model calls fail.
    """
    if not OPENAI_API_KEY or not keywords:
        return None

    # Cache key: stable per (article prefix, keywords, url, language).
    source_preview = " ".join(source_blocks[:3])[:500]
    cache_key = hashlib.md5(
        f"generate_{source_preview}_{str(keywords)}_{target_url}_{language}".encode()
    ).hexdigest()

    system = (
        f"You are a skilled content writer writing in {language}. "
        "Given an article and a list of keywords related to a target page, "
        "create a NATURAL addition to the article that incorporates the most suitable keyword. "
        "The addition should flow seamlessly with the existing content. "
        "\n\nYOUR TASK:\n"
        "1. Choose the ONE keyword that fits most naturally with the article's context\n"
        "2. Create new content (1-3 sentences OR a paragraph if needed) that naturally includes this keyword\n"
        "3. Specify AFTER which paragraph number (0-based) to insert this content\n"
        "4. The keyword should be wrapped in an HTML link to the target URL\n"
        f"5. Write in {language} and preserve special characters\n"
        "\n\nReturn JSON with keys:\n"
        "- 'chosen_keyword': the keyword you selected\n"
        "- 'new_content': the HTML content with <a href> link\n"
        "- 'insert_after_paragraph': paragraph number (0-based) after which to insert\n"
        "- 'reasoning': brief explanation of placement choice"
    )

    user = {
        "article_paragraphs": source_blocks[:7],  # First 7 paragraphs for context
        "available_keywords": keywords,
        "target_url": target_url,
        "language": language,
        "requirements": {
            "natural_flow": True,
            "include_link": True,
            "preserve_tone": True
        }
    }

    try:
        return _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    except Exception as e:
        print(f"[GPT] Content generation failed: {e}")
        # Retry once on the fallback model with a distinct cache key.
        try:
            return _openai_chat_cached(cache_key + "_fallback", FALLBACK_OPENAI_MODEL, system, user)
        except Exception:  # FIX: was a bare except (swallowed SystemExit/KeyboardInterrupt)
            return None
714
+
715
  def to_plain_text(html_or_text):
716
  """Convert HTML to plain text, properly handling special characters."""
717
  text = BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
 
764
  # Anchor is in the suggested sentence - just show where to add the link
765
  final_output = to_plain_text(draft_html) if plain_text else draft_html
766
  result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
767
+ result += f"πŸ”— Add link here:\n\n"
768
  result += f"{final_output}"
769
  else:
770
  # Anchor is in article but not in this sentence
 
779
  final_output = to_plain_text(final_html) if plain_text else final_html
780
 
781
  result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
782
+ result += f"πŸ”— Add link here:\n\n"
783
  result += f"{final_output}"
784
  else:
785
  # Anchor doesn't exist in article at all - need to add it
 
794
  final_output = to_plain_text(final_html) if plain_text else final_html
795
 
796
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
797
+ result += f"πŸ”— Result 1 - Suggested placement:\n\n"
798
  result += f"Original: {original_sentence}\n\n"
799
  result += f"Suggested: {final_output}"
800
 
801
  # Show alternative if requested and available
802
  if suggest_alternative_anchor and res.get("alternative_anchor"):
803
  alt_anchor = res["alternative_anchor"]
804
+ alt_content = res.get("alternative_sentence", "") # This now contains position info + content
 
805
 
806
+ if alt_content:
807
+ # Parse if there's position information
808
+ if "[Insert after paragraph" in alt_content:
809
+ parts = alt_content.split("\n\n", 1)
810
+ position_info = parts[0] if len(parts) > 0 else ""
811
+ actual_content = parts[1] if len(parts) > 1 else alt_content
 
 
 
812
  else:
813
+ position_info = ""
814
+ actual_content = alt_content
 
 
 
 
815
 
816
+ # The content already has the link included from GPT
817
+ alt_output = to_plain_text(actual_content) if plain_text else actual_content
818
 
819
  # Add alternative as Result 2
820
  result += f"\n\n{'='*50}\n\n"
821
+ result += f"πŸ”— Result 2 - Suggested new content to add:\n"
822
+ result += f"πŸ’‘ Using keyword: '{alt_anchor}'\n"
823
+ if position_info:
824
+ result += f"πŸ“ {position_info}\n"
825
+ result += f"\n{alt_output}"
826
 
827
  return result
828