Spaces:

dusan-presswhizz
/

PressWhizz-Link-Insert-Suggestion-API

Sleeping

App Files Community

dusan-presswhizz committed on Aug 24, 2025

Commit

6e710b3

verified ·

1 Parent(s): 8b2a25c

Update app.py

Browse files

Files changed (1) hide show

app.py +175 -166

app.py CHANGED Viewed

@@ -261,163 +261,136 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
     rewritten = f'{base}{clause}{punct}'
     return rewritten, False
-def find_alternative_anchor(blocks, target_url, original_anchor, target_context=None):
     """Find a better anchor text from the article that relates to the target URL."""
     try:
-        # Use provided target context or analyze the URL
-        if not target_context:
-            target_context = analyze_target_url(target_url)
-        # Extract key concepts from target page to understand what it's about
-        target_title = target_context.get("title", "").lower()
-        target_meta = target_context.get("meta_description", "").lower()
-        target_headings = " ".join(target_context.get("headings", [])).lower()
-        target_content = target_context.get("main_content", "").lower()
-        # Extract important keywords from target page (what the page is ACTUALLY about)
-        target_keywords = set()
-        # Common service/product related terms from the target
-        important_words = []
-        for text in [target_title, target_meta, target_headings, target_content[:500]]:
-            # Extract meaningful nouns and phrases
-            words = text.split()
-            for word in words:
-                clean = word.strip('.,!?;:"\'()[]{}')
-                if len(clean) > 3 and clean.isalpha():
-                    important_words.append(clean)
-                    if len(clean) > 4:  # Add to keywords
-                        target_keywords.add(clean)
-        print(f"\nTarget page keywords detected: {list(target_keywords)[:10]}")
-        # Now search for phrases in source article
         full_text = " ".join(blocks)
-        sentences = re.split(r'[.!?]', full_text)
-        candidate_anchors = {}  # phrase -> (sentence, score, needs_bridge)
         for sentence in sentences:
-            if not sentence or len(sentence.strip()) < 20:
                 continue
-            sentence_lower = sentence.lower()
             words = sentence.split()
-            # Look for ALL potential phrases, even loosely related ones
-            # Single important words
-            for word in words:
-                clean_word = word.strip('.,!?;:"\'()[]{}')
-                if (len(clean_word) > 4 and clean_word.isalpha()):
-                    # Calculate relevance even for loose matches
-                    try:
-                        word_emb = embed([clean_word])[0]
-                        target_emb = embed([target_context.get("summary", "")[:500]])[0]
-                        semantic_score = F.cosine_similarity(
-                            word_emb.unsqueeze(0),
-                            target_emb.unsqueeze(0)
-                        ).item()
-                        # Lower threshold for considering candidates
-                        if semantic_score > 0.15:  # Much lower threshold
-                            needs_bridge = semantic_score < 0.3  # Mark if needs bridge content
-                            if clean_word not in candidate_anchors or candidate_anchors[clean_word][1] < semantic_score:
-                                candidate_anchors[clean_word] = (sentence.strip(), semantic_score, needs_bridge)
-                    except:
-                        continue
-            # Look for 2-4 word phrases
             for length in range(2, min(5, len(words) + 1)):
                 for i in range(len(words) - length + 1):
-                    if i < 0 or i + length > len(words):
-                        continue
-                    phrase_words = words[i:i+length]
-                    phrase = ' '.join(phrase_words)
-                    phrase_clean = phrase.strip('.,!?;:"\'()')
-                    # More lenient filtering
-                    skip_words = {'the', 'a', 'an', 'and', 'or', 'but', 'if', 'then', 'than'}
-                    first_word = phrase_words[0].lower().strip('.,!?;:"\'')
-                    last_word = phrase_words[-1].lower().strip('.,!?;:"\'')
-                    # Allow more phrases through
-                    if (len(phrase_clean) < 5 or
-                        len(phrase_clean) > 50 or
-                        not phrase_clean[0].isalpha()):
                         continue
-                    # Skip only the worst fragments
-                    if first_word in skip_words and last_word in skip_words:
-                        continue
-                    # Calculate relevance score
-                    try:
-                        phrase_emb = embed([phrase_clean])[0]
-                        target_emb = embed([target_context.get("summary", "")[:500]])[0]
-                        semantic_score = F.cosine_similarity(
-                            phrase_emb.unsqueeze(0),
-                            target_emb.unsqueeze(0)
-                        ).item()
-                        # Accept even loosely related phrases
-                        if semantic_score > 0.15:  # Much lower threshold
-                            needs_bridge = semantic_score < 0.3  # Mark if needs bridge
-                            # Check for topic-related words (beauty, skincare, nail, etc.)
-                            bonus = 0
-                            general_beauty_terms = ['beauty', 'skincare', 'cosmetic', 'product', 'treatment',
-                                                  'care', 'skin', 'nail', 'makeup', 'store', 'shop',
-                                                  'korean', 'k-beauty', 'routine', 'regimen']
-                            for term in general_beauty_terms:
-                                if term in phrase_clean.lower():
-                                    bonus = 0.1
-                                    break
-                            total_score = semantic_score + bonus
-                            if phrase_clean not in candidate_anchors or candidate_anchors[phrase_clean][1] < total_score:
-                                candidate_anchors[phrase_clean] = (sentence.strip(), total_score, needs_bridge)
-                                if total_score > 0.2:  # Only print decent candidates
-                                    print(f"  Candidate: '{phrase_clean}' (score: {total_score:.3f}, needs_bridge: {needs_bridge})")
-                    except:
-                        continue
-        # If no candidates at all, try to find ANY noun phrase in the article
-        if not candidate_anchors:
-            print("\nNo semantic matches found, looking for any noun phrases...")
-            for sentence in sentences[:10]:  # Check first 10 sentences
-                words = sentence.split()
-                for word in words:
-                    clean_word = word.strip('.,!?;:"\'()[]{}')
-                    # Any proper noun or long word
-                    if clean_word and len(clean_word) > 5 and clean_word[0].isupper():
-                        candidate_anchors[clean_word] = (sentence.strip(), 0.1, True)  # Low score, needs bridge
-                        break
-                if candidate_anchors:
-                    break
-        # Select the best anchor from candidates
-        if not candidate_anchors:
-            print("\n✗ No alternative anchor found at all")
-            return None, None, False
-        # Sort by score and get the best one
-        sorted_candidates = sorted(candidate_anchors.items(), key=lambda x: x[1][1], reverse=True)
-        best_anchor, (best_sentence, best_score, needs_bridge) = sorted_candidates[0]
-        print(f"\n✓ Best alternative anchor: '{best_anchor}' (relevance: {best_score:.3f})")
-        if needs_bridge:
-            print(f"  → Will need bridge paragraph to connect to target topic")
-        return best_anchor, best_sentence, needs_bridge
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
-        import traceback
-        traceback.print_exc()
-        return None, None, False
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
@@ -669,8 +642,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
                 if suggest_alternative and not keyword_present:
                     try:
                         # Find a completely different anchor and sentence
-                        # Pass the target_context we already analyzed
-                        alt_anchor, alt_sentence, needs_bridge = find_alternative_anchor(blocks, target_url, anchor_text, target_context)
                         if alt_anchor and alt_sentence:
                             # Create the sentence with the alternative anchor
@@ -679,7 +651,6 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
                             result["alternative_sentence_original"] = alt_sentence
                             result["alternative_sentence"] = alt_rewritten
                             result["alternative_exact_match"] = alt_exact
-                            result["needs_bridge_paragraph"] = needs_bridge
                     except Exception as e:
                         print(f"Error finding alternative anchor: {e}")
                         # Continue without alternative
@@ -798,51 +769,89 @@ def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", languag
     # Don't check for exact anchor text match as it might have special chars
     return {"sentence_html": out}
-def gpt_create_bridge_paragraph(anchor_text, sentence, target_url, target_context, language="English"):
-    """Create a bridge paragraph that naturally connects loosely related topics."""
     if not OPENAI_API_KEY:
-        return {"paragraph": sentence}
     # Create cache key
-    cache_key = hashlib.md5(f"bridge_{anchor_text}{sentence}{target_url}{language}".encode()).hexdigest()
-    target_title = target_context.get("title", "")
-    target_topic = target_context.get("meta_description", "")
     system = (
-        f"You are a skilled content writer writing in {language}. "
-        f"IMPORTANT: Preserve all special characters and diacritics from the {language} language. "
-        "Your task is to create a natural bridge paragraph that connects two loosely related topics. "
-        "The paragraph should flow naturally from the source topic to the target topic. "
         "RULES: "
-        "(1) Start with the context from the source article "
-        "(2) Create a natural transition to the target topic "
-        "(3) Include the anchor link naturally "
-        "(4) Make it 2-3 sentences that feel organic, not forced "
-        "(5) Avoid obvious transitions like 'Speaking of...' or 'On a related note...' "
-        "Return JSON with key 'paragraph' containing the HTML with the link included."
     )
     user = {
-        "task": "create_bridge_paragraph",
-        "source_context": sentence,
-        "anchor_text": anchor_text,
         "target_url": target_url,
-        "target_title": target_title,
-        "target_topic": target_topic,
         "language": language,
-        "instructions": "Create a smooth, natural paragraph that connects these topics"
     }
     try:
         obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
         return obj
-    except:
-        try:
-            obj = _openai_chat_cached(cache_key + "_fallback", FALLBACK_OPENAI_MODEL, system, user)
-            return obj
-        except:
-            return {"paragraph": sentence}
     """
     Final QA pass with language support.
     """

     rewritten = f'{base}{clause}{punct}'
     return rewritten, False
+def find_alternative_anchor(blocks, target_url, original_anchor):
     """Find a better anchor text from the article that relates to the target URL."""
     try:
+        # Get target page context
+        try:
+            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
+            soup = BeautifulSoup(tgt_html, "html.parser")
+            # Extract target page title and meta description
+            title = soup.title.get_text().strip() if soup.title else ""
+            meta_desc = ""
+            meta_tag = soup.find("meta", attrs={"name": "description"})
+            if meta_tag:
+                meta_desc = meta_tag.get("content", "")
+            # Extract key terms from target page (first few paragraphs)
+            target_paragraphs = []
+            for p in soup.find_all("p")[:5]:
+                text = p.get_text().strip()
+                if len(text) > 50:
+                    target_paragraphs.append(text)
+            target_content = " ".join(target_paragraphs[:3])
+        except Exception as e:
+            print(f"Error fetching target URL: {e}")
+            title = ""
+            meta_desc = ""
+            target_content = original_anchor
+        # Extract all potential anchor phrases from the source article
+        all_phrases = set()
         full_text = " ".join(blocks)
+        # Common words to exclude
+        stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+                     'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
+                     'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
+                     'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
+                     'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što'}
+        # Extract noun phrases and important terms (2-4 words)
+        sentences = re.split(r'[.!?]', full_text)
         for sentence in sentences:
+            if not sentence:
                 continue
             words = sentence.split()
+            # Extract phrases of 2-4 words
             for length in range(2, min(5, len(words) + 1)):
                 for i in range(len(words) - length + 1):
+                    if i < 0 or i+length > len(words):
                         continue
+                    phrase = ' '.join(words[i:i+length])
+                    phrase_clean = phrase.strip('.,!?;:"\'')
+                    # Check if phrase is meaningful
+                    if i < len(words) and i+length-1 < len(words):
+                        first_word = words[i].lower().strip('.,!?;:')
+                        last_word = words[i+length-1].lower().strip('.,!?;:')
+                        # Skip if starts/ends with stopwords or is too short
+                        if (first_word not in stopwords and
+                            last_word not in stopwords and
+                            len(phrase_clean) > 5 and
+                            len(phrase_clean) < 50):
+                            all_phrases.add(phrase_clean)
+            # Also extract single important words (proper nouns, long words)
+            for word in words:
+                clean_word = word.strip('.,!?;:"\'')
+                if clean_word and (len(clean_word) > 6 or
+                    (len(clean_word) > 0 and clean_word[0].isupper() and clean_word.lower() not in stopwords)):
+                    all_phrases.add(clean_word)
+        if not all_phrases:
+            return None, None
+        # Create context query from target URL info
+        target_context = f"{title} {meta_desc} {target_content}"[:500]
+        # Score each phrase based on relevance to target
+        try:
+            target_emb = embed([target_context])[0]
+        except:
+            return None, None
+        best_anchor = None
+        best_score = -1
+        best_sentence = None
+        # Evaluate each potential anchor
+        for phrase in list(all_phrases)[:50]:  # Limit to first 50 to avoid too much processing
+            # Skip if too similar to original anchor (we want something different)
+            if phrase.lower() == original_anchor.lower():
+                continue
+            try:
+                # Score this phrase against target context
+                phrase_emb = embed([phrase])[0]
+                relevance_score = F.cosine_similarity(phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
+                # Check if this phrase appears in article and find its best context
+                if phrase.lower() in full_text.lower():
+                    # Find sentences containing this phrase
+                    for block in blocks:
+                        if phrase.lower() in block.lower():
+                            sents = re.split(r'(?<=[.!?])\s+', block)
+                            for sent in sents:
+                                if sent and phrase.lower() in sent.lower():
+                                    # Score this sentence-phrase combination
+                                    try:
+                                        sent_emb = embed([sent])[0]
+                                        context_score = F.cosine_similarity(sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
+                                        combined_score = (relevance_score * 0.6) + (context_score * 0.4)
+                                        if combined_score > best_score:
+                                            best_score = combined_score
+                                            best_anchor = phrase
+                                            best_sentence = sent
+                                    except:
+                                        continue
+            except Exception as e:
+                print(f"Error evaluating phrase '{phrase}': {e}")
+                continue
+        return best_anchor, best_sentence
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
+        return None, None
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
                 if suggest_alternative and not keyword_present:
                     try:
                         # Find a completely different anchor and sentence
+                        alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)
                         if alt_anchor and alt_sentence:
                             # Create the sentence with the alternative anchor
                             result["alternative_sentence_original"] = alt_sentence
                             result["alternative_sentence"] = alt_rewritten
                             result["alternative_exact_match"] = alt_exact
                     except Exception as e:
                         print(f"Error finding alternative anchor: {e}")
                         # Continue without alternative
     # Don't check for exact anchor text match as it might have special chars
     return {"sentence_html": out}
+def gpt_get_target_keywords(target_url, target_context, language="English"):
+    """Ask GPT to suggest 5-10 relevant search keywords users would use to find this page."""
     if not OPENAI_API_KEY:
+        return []
     # Create cache key
+    cache_key = hashlib.md5(f"keywords_{target_url}{language}".encode()).hexdigest()
+    if cache_key in API_RESPONSE_CACHE:
+        print(f"[GPT] Using cached keywords for {target_url[:30]}...")
+        return API_RESPONSE_CACHE[cache_key].get("keywords", [])
+    title = target_context.get("title", "")
+    meta = target_context.get("meta_description", "")
+    content = target_context.get("main_content", "")[:500]
     system = (
+        "You are an SEO expert. Based on the page content provided, suggest 5-10 search keywords or phrases "
+        "that users would likely type into Google to find this page. "
+        "Include both short keywords (1-2 words) and long-tail keywords (3-5 words). "
+        "Make them realistic search terms, not just words from the page. "
+        f"Consider the {language} language and local search patterns. "
+        "Return JSON with a 'keywords' array."
+    )
+    user = {
+        "url": target_url,
+        "title": title,
+        "meta_description": meta,
+        "content_preview": content,
+        "task": "Generate search keywords users would use to find this page"
+    }
+    try:
+        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
+        keywords = obj.get("keywords", [])
+        print(f"\n[GPT] Target page keywords: {keywords}")
+        return keywords
+    except Exception as e:
+        print(f"[GPT] Error getting keywords: {e}")
+        return []
+def gpt_add_keyword_to_content(blocks, keywords, target_url, language="English"):
+    """Ask GPT to naturally add one of the keywords to the content with proper context."""
+    if not OPENAI_API_KEY or not keywords:
+        return None
+    # Create cache key
+    blocks_preview = " ".join(blocks[:3])[:500]
+    cache_key = hashlib.md5(f"add_kw_{blocks_preview}{str(keywords)}{target_url}".encode()).hexdigest()
+    if cache_key in API_RESPONSE_CACHE:
+        return API_RESPONSE_CACHE[cache_key]
+    system = (
+        f"You are a skilled content editor writing in {language}. "
+        "Your task is to naturally integrate ONE of the provided keywords into the article content. "
         "RULES: "
+        "1. Choose the keyword that fits most naturally with the existing content "
+        "2. Add 2-3 sentences or a short paragraph that includes the keyword "
+        "3. Make it flow naturally - it should feel like it belongs there "
+        "4. Include an HTML link using the keyword as anchor text "
+        "5. Specify WHERE to add it (e.g., 'after the second paragraph', 'before the conclusion') "
+        "6. The addition should provide value, not just keyword stuffing "
+        f"7. Write in {language} and preserve special characters "
+        "Return JSON with: 'keyword_used', 'content_to_add', 'placement_instruction'"
     )
     user = {
+        "article_preview": " ".join(blocks[:5]),
+        "available_keywords": keywords,
         "target_url": target_url,
         "language": language,
+        "task": "Add one keyword naturally to the content"
     }
     try:
         obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
+        API_RESPONSE_CACHE[cache_key] = obj
         return obj
+    except Exception as e:
+        print(f"[GPT] Error adding keyword: {e}")
+        return None
     """
     Final QA pass with language support.
     """