Spaces:

dusan-presswhizz
/

PressWhizz-Link-Insert-Suggestion-API

Build error

App Files Files Community

dusan-presswhizz committed on Aug 24, 2025

Commit

d107e20

verified ·

1 Parent(s): 4bc41f9

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -49

app.py CHANGED Viewed

@@ -226,34 +226,117 @@ def create_anchor_suggestion(anchor_text, target_url):
     ]
     return suggestions[0]
-def extract_potential_anchors(sentence, target_url):
-    """Extract potential anchor text phrases from a sentence."""
-    # Remove very common words and extract meaningful phrases
     stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                  'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
-                 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should'}
-    # Split into words and find continuous phrases
-    words = sentence.split()
-    phrases = []
-    # Generate phrases of 2-5 words
-    for length in range(2, min(6, len(words) + 1)):
-        for i in range(len(words) - length + 1):
-            phrase = ' '.join(words[i:i+length])
-            # Check if phrase doesn't start/end with stopwords
-            first_word = words[i].lower().strip('.,!?;:')
-            last_word = words[i+length-1].lower().strip('.,!?;:')
-            if first_word not in stopwords and last_word not in stopwords:
-                phrases.append(phrase.strip('.,!?;:'))
-    # Also add significant single words (proper nouns, long words)
-    for word in words:
-        clean_word = word.strip('.,!?;:')
-        if (len(clean_word) > 7 or clean_word[0].isupper()) and clean_word.lower() not in stopwords:
-            phrases.append(clean_word)
-    return phrases[:5]  # Return top 5 potential anchors
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
     blocks = get_text_blocks(source_url)
@@ -302,27 +385,14 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
         # If anchor not present and alternative suggestion requested
         if suggest_alternative and not keyword_present:
-            # Extract potential anchor phrases from the best sentence
-            potential_anchors = extract_potential_anchors(best_sent, target_url)
-            # Find the best alternative anchor
-            best_alternative = None
-            best_alt_score = -1
-            for alt_anchor in potential_anchors:
-                # Check relevance to target
-                alt_query = f"{alt_anchor} — relevant to: {tgt_title}"
-                alt_q_emb = embed([alt_query])[0]
-                alt_sim = F.cosine_similarity(alt_q_emb.unsqueeze(0), q_emb.unsqueeze(0)).item()
-                if alt_sim > best_alt_score:
-                    best_alt_score = alt_sim
-                    best_alternative = alt_anchor
-            if best_alternative:
-                # Create alternative suggestion with the better anchor
-                alt_rewritten, alt_exact = inject_anchor_into_sentence(best_sent, best_alternative, target_url)
-                result["alternative_anchor"] = best_alternative
                 result["alternative_sentence"] = alt_rewritten
                 result["alternative_exact_match"] = alt_exact
@@ -531,18 +601,23 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
     # Process alternative anchor if requested and original anchor not found
     if suggest_alternative_anchor and not res.get("keyword_in_article", True) and res.get("alternative_anchor"):
         alt_anchor = res["alternative_anchor"]
         alt_sentence = res["alternative_sentence"]
         # Apply GPT rewriting to alternative as well
         if smart_rewrite:
-            alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=language_name)
             alt_final = alt_g["sentence_html"]
         else:
             alt_final = alt_sentence
         # Polish if needed
         if not res.get("alternative_exact_match", False):
-            alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=language_name)
             alt_final = alt_polished.get("sentence_html", alt_final)
         alt_output = to_plain_text(alt_final) if plain_text else alt_final
@@ -551,7 +626,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
         result += f"💡 OPTION 2 - Better anchor suggestion:\n\n"
         result += f"Since '{anchor_text}' is not in the article, consider using:\n"
         result += f"Suggested anchor: '{alt_anchor}'\n\n"
-        result += f"Change this sentence:\n{original_sentence}\n\nWith this one:\n{alt_output}"
     return result

     ]
     return suggestions[0]
# Words that may not start or end a candidate anchor phrase (English plus a
# handful of South-Slavic function words, as in the original list).
_ANCHOR_STOPWORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
    'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što',
})

# Punctuation and whitespace trimmed from both ends of words and phrases.
_ANCHOR_STRIP_CHARS = '.,!?;:"\' '


def _fetch_target_context(target_url, fallback_text):
    """Return ``(title, meta_description, leading_text)`` for the target page.

    This is a best-effort lookup: on any failure the error is printed and
    ``("", "", fallback_text)`` is returned so scoring can still proceed.
    """
    try:
        tgt_html = requests.get(target_url, timeout=20, headers=UA).text
        soup = BeautifulSoup(tgt_html, "html.parser")
        title = soup.title.get_text().strip() if soup.title else ""
        meta_tag = soup.find("meta", attrs={"name": "description"})
        meta_desc = (meta_tag.get("content", "") or "") if meta_tag else ""
        # Only the first five <p> tags are inspected; short ones are skipped
        # and at most three of the remaining paragraphs are kept.
        paragraphs = []
        for p in soup.find_all("p")[:5]:
            text = p.get_text().strip()
            if len(text) > 50:
                paragraphs.append(text)
        return title, meta_desc, " ".join(paragraphs[:3])
    except Exception as e:  # deliberate best-effort boundary
        print(f"Error fetching target URL: {e}")
        return "", "", fallback_text


def _extract_candidate_phrases(full_text):
    """Collect candidate anchors: 2-4 word phrases plus notable single words."""
    phrases = set()
    for sentence in re.split(r'[.!?]', full_text):
        words = sentence.split()
        cleaned = [w.strip(_ANCHOR_STRIP_CHARS) for w in words]
        lowered = [c.lower() for c in cleaned]

        # Phrases of 2-4 words that neither start nor end with a stopword.
        for length in range(2, min(5, len(words) + 1)):
            for start in range(len(words) - length + 1):
                end = start + length
                if (lowered[start] in _ANCHOR_STOPWORDS
                        or lowered[end - 1] in _ANCHOR_STOPWORDS):
                    continue
                phrase = ' '.join(words[start:end]).strip(_ANCHOR_STRIP_CHARS)
                if 5 < len(phrase) < 50:
                    phrases.add(phrase)

        # Single words: long ones, or capitalised non-stopwords.
        for word, word_lower in zip(cleaned, lowered):
            if not word:
                # Token was pure punctuation; indexing word[0] would raise.
                continue
            if len(word) > 6 or (word[0].isupper()
                                 and word_lower not in _ANCHOR_STOPWORDS):
                phrases.add(word)
    return phrases


def find_alternative_anchor(blocks, target_url, original_anchor):
    """Find a better anchor text from the article that relates to the target URL.

    Args:
        blocks: Iterable of text blocks (strings) from the source article.
        target_url: URL of the page the anchor should link to.
        original_anchor: The anchor text originally requested; an identical
            phrase (case-insensitive) is never returned as the alternative.

    Returns:
        ``(anchor, sentence)`` where ``sentence`` is the article sentence that
        contains ``anchor`` with the best combined score, or ``(None, None)``
        when no candidate phrase could be found or placed in a sentence.
    """
    original_anchor = original_anchor or ""
    title, meta_desc, target_content = _fetch_target_context(
        target_url, original_anchor
    )

    candidates = _extract_candidate_phrases(" ".join(blocks))
    if not candidates:
        return None, None

    # Context query built from the target page, capped at 500 characters.
    target_context = f"{title} {meta_desc} {target_content}"[:500]
    target_emb = embed([target_context])[0]

    def similarity_to_target(text):
        text_emb = embed([text])[0]
        return F.cosine_similarity(
            text_emb.unsqueeze(0), target_emb.unsqueeze(0)
        ).item()

    # Split every block into sentences once, keeping a lowercase copy for
    # case-insensitive containment checks.
    sentences = [
        (sent, sent.lower())
        for block in blocks
        for sent in re.split(r'(?<=[.!?])\s+', block)
    ]
    # A sentence usually contains many candidate phrases; embed it only once.
    sentence_scores = {}

    anchor_lower = original_anchor.lower()
    best_anchor = None
    best_sentence = None
    best_score = -1

    # Sorted so that ties are resolved the same way on every run (plain set
    # iteration order varies with string hash randomisation).
    for phrase in sorted(candidates):
        phrase_lower = phrase.lower()
        # Skip the original anchor itself; we want something different.
        if phrase_lower == anchor_lower:
            continue

        containing = [sent for sent, low in sentences if phrase_lower in low]
        if not containing:
            # E.g. a phrase that spans two blocks; nothing to anchor it in.
            continue

        relevance_score = similarity_to_target(phrase)
        for sent in containing:
            if sent not in sentence_scores:
                sentence_scores[sent] = similarity_to_target(sent)
            combined_score = (relevance_score * 0.6) + (sentence_scores[sent] * 0.4)
            if combined_score > best_score:
                best_score = combined_score
                best_anchor = phrase
                best_sentence = sent

    return best_anchor, best_sentence
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
     blocks = get_text_blocks(source_url)
         # If anchor not present and alternative suggestion requested
         if suggest_alternative and not keyword_present:
+            # Find a completely different anchor and sentence
+            alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)
+            if alt_anchor and alt_sentence:
+                # Create the sentence with the alternative anchor
+                alt_rewritten, alt_exact = inject_anchor_into_sentence(alt_sentence, alt_anchor, target_url)
+                result["alternative_anchor"] = alt_anchor
+                result["alternative_sentence_original"] = alt_sentence
                 result["alternative_sentence"] = alt_rewritten
                 result["alternative_exact_match"] = alt_exact
     # Process alternative anchor if requested and original anchor not found
     if suggest_alternative_anchor and not res.get("keyword_in_article", True) and res.get("alternative_anchor"):
         alt_anchor = res["alternative_anchor"]
+        alt_sentence_original = res.get("alternative_sentence_original", res["best_sentence_original"])
         alt_sentence = res["alternative_sentence"]
+        # Detect language for alternative sentence
+        alt_detected_lang = detect_language(alt_sentence_original)
+        alt_language_name = get_language_name(alt_detected_lang)
         # Apply GPT rewriting to alternative as well
         if smart_rewrite:
+            alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=alt_language_name)
             alt_final = alt_g["sentence_html"]
         else:
             alt_final = alt_sentence
         # Polish if needed
         if not res.get("alternative_exact_match", False):
+            alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=alt_language_name)
             alt_final = alt_polished.get("sentence_html", alt_final)
         alt_output = to_plain_text(alt_final) if plain_text else alt_final
         result += f"💡 OPTION 2 - Better anchor suggestion:\n\n"
         result += f"Since '{anchor_text}' is not in the article, consider using:\n"
         result += f"Suggested anchor: '{alt_anchor}'\n\n"
+        result += f"Change this sentence:\n{alt_sentence_original}\n\nWith this one:\n{alt_output}"
     return result