dusan-presswhizz committed on
Commit
8263900
·
verified ·
1 Parent(s): a072005

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +230 -168
app.py CHANGED
@@ -298,191 +298,253 @@ def create_anchor_suggestion(anchor_text, target_url):
298
 
299
def find_alternative_anchor(blocks, target_url, original_anchor):
    """Find a better anchor text from the article that relates to the target URL.

    Fetches the target page to build a context string (title, meta description,
    lead paragraphs), extracts candidate 2-4 word phrases and single important
    words from the source article, then scores each candidate against the
    target context via embedding cosine similarity.

    Args:
        blocks: list of text blocks extracted from the source article.
        target_url: URL of the page the anchor should link to.
        original_anchor: originally requested anchor text; used as a fallback
            context when the target page cannot be fetched, and excluded from
            the candidate set.

    Returns:
        Tuple of (best_anchor, best_sentence), or (None, None) when no
        candidate phrase could be extracted.
    """
    # Get target page context; fall back to the original anchor text when the
    # target page cannot be fetched or parsed.
    try:
        tgt_html = requests.get(target_url, timeout=20, headers=UA).text
        soup = BeautifulSoup(tgt_html, "html.parser")

        # Extract target page title and meta description.
        title = soup.title.get_text().strip() if soup.title else ""
        meta_desc = ""
        meta_tag = soup.find("meta", attrs={"name": "description"})
        if meta_tag:
            meta_desc = meta_tag.get("content", "")

        # Extract key terms from the target page (first few substantial paragraphs).
        target_paragraphs = []
        for p in soup.find_all("p")[:5]:
            text = p.get_text().strip()
            if len(text) > 50:
                target_paragraphs.append(text)
        target_content = " ".join(target_paragraphs[:3])

    except Exception as e:
        print(f"Error fetching target URL: {e}")
        title = ""
        meta_desc = ""
        target_content = original_anchor

    # Extract all potential anchor phrases from the source article.
    all_phrases = set()
    full_text = " ".join(blocks)

    # Common words to exclude (English + Croatian/Serbian stopwords).
    stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
                 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                 'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
                 'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što'}

    # Extract noun phrases and important terms (2-4 words).
    sentences = re.split(r'[.!?]', full_text)
    for sentence in sentences:
        words = sentence.split()

        # Sliding windows of 2-4 words.
        for length in range(2, min(5, len(words) + 1)):
            for i in range(len(words) - length + 1):
                phrase = ' '.join(words[i:i + length])
                phrase_clean = phrase.strip('.,!?;:"\' ')

                # Check if the phrase is meaningful: edges must not be
                # stopwords and the cleaned phrase must be a sensible length.
                first_word = words[i].lower().strip('.,!?;:')
                last_word = words[i + length - 1].lower().strip('.,!?;:')

                if (first_word not in stopwords and
                        last_word not in stopwords and
                        5 < len(phrase_clean) < 50):
                    all_phrases.add(phrase_clean)

        # Also extract single important words (proper nouns, long words).
        for word in words:
            clean_word = word.strip('.,!?;:"\' ')
            # BUGFIX: punctuation-only tokens (e.g. '—', '"') strip to "", and
            # indexing clean_word[0] on an empty string raised IndexError.
            if not clean_word:
                continue
            if (len(clean_word) > 6 or
                    (clean_word[0].isupper() and clean_word.lower() not in stopwords)):
                all_phrases.add(clean_word)

    if not all_phrases:
        return None, None

    # Create the context query from target URL info (truncated for embedding).
    target_context = f"{title} {meta_desc} {target_content}"[:500]

    # Score each phrase based on relevance to the target context.
    target_emb = embed([target_context])[0]

    best_anchor = None
    best_score = -1
    best_sentence = None

    # Evaluate each potential anchor.
    for phrase in all_phrases:
        # Skip if identical to the original anchor (we want something different).
        if phrase.lower() == original_anchor.lower():
            continue

        # Relevance of the phrase itself to the target context.
        phrase_emb = embed([phrase])[0]
        relevance_score = F.cosine_similarity(phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()

        # For phrases that appear in the article, blend phrase relevance with
        # the relevance of the sentence the phrase occurs in.
        if phrase.lower() in full_text.lower():
            for block in blocks:
                if phrase.lower() in block.lower():
                    sents = re.split(r'(?<=[.!?])\s+', block)
                    for sent in sents:
                        if phrase.lower() in sent.lower():
                            # Score this sentence-phrase combination.
                            sent_emb = embed([sent])[0]
                            context_score = F.cosine_similarity(sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
                            combined_score = (relevance_score * 0.6) + (context_score * 0.4)

                            if combined_score > best_score:
                                best_score = combined_score
                                best_anchor = phrase
                                best_sentence = sent

    return best_anchor, best_sentence
 
411
def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
    """Suggest sentences in the source article where the anchor link fits best.

    Embeds the article blocks and a query built from the anchor text plus the
    target page's title/domain, picks the top-k most similar blocks, then the
    most similar sentence inside each, and injects the anchor into it.

    Args:
        source_url: URL of the article to insert the link into.
        target_url: URL the anchor should point at.
        anchor_text: desired anchor text.
        top_k: number of candidate blocks to return suggestions for.
        suggest_alternative: when True and the anchor text is absent from the
            article, also propose an alternative anchor/sentence pair.

    Returns:
        List of result dicts (or a single-element list with an "error" key
        when the page yields no text blocks).
    """
    blocks = get_text_blocks(source_url)
    if not blocks:
        return [{"error":"No text blocks found on the page."}]

    # Check if the keyword is present in the article.
    full_text = " ".join(blocks).lower()
    # BUGFIX: '"" in s' is always True, so an empty anchor_text falsely
    # reported the keyword as present.
    keyword_present = anchor_text.lower() in full_text if anchor_text else False

    # Target context: best-effort fetch of the target page title.
    try:
        tgt_html = requests.get(target_url, timeout=20, headers=UA).text
        tt = BeautifulSoup(tgt_html, "html.parser").title
        tgt_title = tt.get_text().strip() if tt else ""
    except Exception:
        tgt_title = ""

    ext = tldextract.extract(target_url)
    tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])

    # First, find the best-matching blocks for the original anchor.
    query = f"{anchor_text} – relevant to: {tgt_title} ({tgt_domain})"
    q_emb = embed([query])[0]

    blk_embs = embed(blocks)
    sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks), 1))
    top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()

    results = []
    for idx in top_idx:
        blk = blocks[idx]
        # Split sentences more carefully and drop empty ones.
        sents = re.split(r'(?<=[.!?])\s+', blk)
        sents = [s for s in sents if s and len(s.strip()) > 0]

        if not sents:
            # If no valid sentences, use the whole block.
            sents = [blk]

        try:
            s_embs = embed(sents)
            s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents), 1))
            # .item() converts the 0-d tensor to a Python scalar explicitly.
            si = int(torch.argmax(s_sims).item())
            best_sent = sents[min(si, len(sents) - 1)]  # Ensure index is valid
        except Exception as e:
            print(f"Error in sentence embedding: {e}")
            # Fallback to first sentence.
            best_sent = sents[0] if sents else blk

        rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)

        result = {
            "anchor_was_present": exact_found,
            "best_sentence_original": best_sent,
            "best_sentence_with_anchor": rewritten_sent,
            "keyword_in_article": keyword_present
        }

        # If the anchor is not present and an alternative suggestion was
        # requested, look for a completely different anchor and sentence.
        if suggest_alternative and not keyword_present:
            alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)

            if alt_anchor and alt_sentence:
                # Create the sentence with the alternative anchor.
                alt_rewritten, alt_exact = inject_anchor_into_sentence(alt_sentence, alt_anchor, target_url)
                result["alternative_anchor"] = alt_anchor
                result["alternative_sentence_original"] = alt_sentence
                result["alternative_sentence"] = alt_rewritten
                result["alternative_exact_match"] = alt_exact

        results.append(result)

    return results
487
  # =========================
488
  # OpenAI helpers with caching
 
298
 
299
def find_alternative_anchor(blocks, target_url, original_anchor):
    """Find a better anchor text from the article that relates to the target URL.

    Post-commit hardened variant: all work is wrapped in a top-level guard so
    any unexpected failure degrades to (None, None) instead of propagating.

    Args:
        blocks: list of text blocks from the source article.
        target_url: URL of the page the anchor should link to.
        original_anchor: originally requested anchor text (fallback context,
            excluded from candidates).

    Returns:
        Tuple of (best_anchor, best_sentence) or (None, None) on failure.
    """
    try:
        # Get target page context; fall back to the original anchor text when
        # the target page cannot be fetched.
        try:
            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
            soup = BeautifulSoup(tgt_html, "html.parser")

            # Extract target page title and meta description.
            title = soup.title.get_text().strip() if soup.title else ""
            meta_desc = ""
            meta_tag = soup.find("meta", attrs={"name": "description"})
            if meta_tag:
                meta_desc = meta_tag.get("content", "")

            # Extract key terms from target page (first few paragraphs).
            target_paragraphs = []
            for p in soup.find_all("p")[:5]:
                text = p.get_text().strip()
                if len(text) > 50:
                    target_paragraphs.append(text)
            target_content = " ".join(target_paragraphs[:3])

        except Exception as e:
            print(f"Error fetching target URL: {e}")
            title = ""
            meta_desc = ""
            target_content = original_anchor

        # Extract all potential anchor phrases from the source article.
        all_phrases = set()
        full_text = " ".join(blocks)

        # Common words to exclude (English + Croatian/Serbian stopwords).
        stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                     'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
                     'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                     'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
                     'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što'}

        # Extract noun phrases and important terms (2-4 words).
        sentences = re.split(r'[.!?]', full_text)
        for sentence in sentences:
            if not sentence:
                continue
            words = sentence.split()

            # Sliding windows of 2-4 words. NOTE: the former bounds guards
            # (i < 0, i+length > len(words)) were dead code — range() already
            # constrains i to [0, len(words)-length] — so they were removed.
            for length in range(2, min(5, len(words) + 1)):
                for i in range(len(words) - length + 1):
                    phrase = ' '.join(words[i:i + length])
                    phrase_clean = phrase.strip('.,!?;:"\' ')

                    # Check if the phrase is meaningful: edges not stopwords,
                    # cleaned phrase of a sensible length.
                    first_word = words[i].lower().strip('.,!?;:')
                    last_word = words[i + length - 1].lower().strip('.,!?;:')

                    if (first_word not in stopwords and
                            last_word not in stopwords and
                            5 < len(phrase_clean) < 50):
                        all_phrases.add(phrase_clean)

            # Also extract single important words (proper nouns, long words).
            for word in words:
                clean_word = word.strip('.,!?;:"\' ')
                # Truthiness check alone guards the clean_word[0] access.
                if clean_word and (len(clean_word) > 6 or
                                   (clean_word[0].isupper() and clean_word.lower() not in stopwords)):
                    all_phrases.add(clean_word)

        if not all_phrases:
            return None, None

        # Create context query from target URL info (truncated for embedding).
        target_context = f"{title} {meta_desc} {target_content}"[:500]

        # Score each phrase based on relevance to target.
        # FIX: was a bare `except:` — also swallowed KeyboardInterrupt/SystemExit.
        try:
            target_emb = embed([target_context])[0]
        except Exception as e:
            print(f"Error embedding target context: {e}")
            return None, None

        best_anchor = None
        best_score = -1
        best_sentence = None

        # Evaluate each potential anchor. FIX: sets iterate in arbitrary
        # order, so `list(all_phrases)[:50]` sampled a non-deterministic
        # subset; sorting makes the capped candidate set reproducible.
        for phrase in sorted(all_phrases)[:50]:  # Limit to 50 to bound embedding cost
            # Skip if identical to the original anchor (we want something different).
            if phrase.lower() == original_anchor.lower():
                continue

            try:
                # Score this phrase against the target context.
                phrase_emb = embed([phrase])[0]
                relevance_score = F.cosine_similarity(phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()

                # Check if this phrase appears in article and find its best context.
                if phrase.lower() in full_text.lower():
                    # Find sentences containing this phrase.
                    for block in blocks:
                        if phrase.lower() in block.lower():
                            sents = re.split(r'(?<=[.!?])\s+', block)
                            for sent in sents:
                                if sent and phrase.lower() in sent.lower():
                                    # Score this sentence-phrase combination.
                                    # FIX: was a bare `except:` here too.
                                    try:
                                        sent_emb = embed([sent])[0]
                                        context_score = F.cosine_similarity(sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
                                        combined_score = (relevance_score * 0.6) + (context_score * 0.4)

                                        if combined_score > best_score:
                                            best_score = combined_score
                                            best_anchor = phrase
                                            best_sentence = sent
                                    except Exception:
                                        continue
            except Exception as e:
                print(f"Error evaluating phrase '{phrase}': {e}")
                continue

        return best_anchor, best_sentence

    except Exception as e:
        print(f"Critical error in find_alternative_anchor: {e}")
        return None, None
 
430
def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
    """Suggest sentences in the source article where the anchor link fits best.

    Post-commit hardened variant: every stage has a fallback so the function
    always returns a usable list of result dicts, even on partial failure.

    Args:
        source_url: URL of the article to insert the link into.
        target_url: URL the anchor should point at.
        anchor_text: desired anchor text.
        top_k: number of candidate blocks to return suggestions for.
        suggest_alternative: when True and the anchor text is absent from the
            article, also propose an alternative anchor/sentence pair.

    Returns:
        List of result dicts; on unrecoverable failure a single-element list
        with an "error" key plus manual-insertion guidance.
    """
    try:
        blocks = get_text_blocks(source_url)
        if not blocks:
            return [{"error":"No text blocks found on the page."}]

        # Check if the keyword is present in the article (empty anchor counts
        # as absent — '"" in s' would otherwise always be True).
        full_text = " ".join(blocks).lower()
        keyword_present = anchor_text.lower() in full_text if anchor_text else False

        # Target context: best-effort fetch of the target page title.
        try:
            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
            tt = BeautifulSoup(tgt_html, "html.parser").title
            tgt_title = tt.get_text().strip() if tt else ""
        except Exception as e:
            print(f"Error fetching target URL: {e}")
            tgt_title = ""

        ext = tldextract.extract(target_url)
        tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])

        # First, find the best-matching blocks for the original anchor.
        query = f"{anchor_text} – relevant to: {tgt_title} ({tgt_domain})"

        try:
            q_emb = embed([query])[0]
            blk_embs = embed(blocks)
            sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks), 1))
            top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()
        except Exception as e:
            print(f"Error in block embedding/similarity: {e}")
            # BUGFIX: q_emb was left undefined on this path but is used below;
            # define it as None so the sentence-selection step can skip cleanly
            # instead of raising NameError inside the per-block handler.
            q_emb = None
            # Fallback to first block.
            top_idx = [0]

        results = []
        for idx in top_idx:
            try:
                blk = blocks[min(idx, len(blocks) - 1)]  # Ensure valid index

                # Split sentences more carefully; keep only substantial ones.
                sents = re.split(r'(?<=[.!?])\s+', blk)
                sents = [s.strip() for s in sents if s and len(s.strip()) > 10]

                if not sents:
                    # If no valid sentences, use the whole block.
                    sents = [blk]

                best_sent = sents[0]  # Default to first sentence

                try:
                    # Only attempt embedding when the query embedding exists
                    # (the filter above already guarantees non-empty sents).
                    if q_emb is not None and sents:
                        s_embs = embed(sents)
                        s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents), 1))
                        si = int(torch.argmax(s_sims).item())  # .item() -> scalar
                        if 0 <= si < len(sents):
                            best_sent = sents[si]
                except Exception as e:
                    print(f"Error in sentence selection: {e}, using first sentence")
                    # Keep default (first sentence).

                # Ensure best_sent is valid before processing.
                if not best_sent or len(best_sent.strip()) == 0:
                    best_sent = blk if blk else "Unable to extract sentence from this section."

                rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)

                result = {
                    "anchor_was_present": exact_found,
                    "best_sentence_original": best_sent,
                    "best_sentence_with_anchor": rewritten_sent,
                    "keyword_in_article": keyword_present
                }

                # If anchor not present and alternative suggestion requested.
                if suggest_alternative and not keyword_present:
                    try:
                        # Find a completely different anchor and sentence.
                        alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)

                        if alt_anchor and alt_sentence:
                            # Create the sentence with the alternative anchor.
                            alt_rewritten, alt_exact = inject_anchor_into_sentence(alt_sentence, alt_anchor, target_url)
                            result["alternative_anchor"] = alt_anchor
                            result["alternative_sentence_original"] = alt_sentence
                            result["alternative_sentence"] = alt_rewritten
                            result["alternative_exact_match"] = alt_exact
                    except Exception as e:
                        print(f"Error finding alternative anchor: {e}")
                        # Continue without alternative.

                results.append(result)

            except Exception as e:
                print(f"Error processing block {idx}: {e}")
                # Add a fallback result so the caller still gets an entry.
                results.append({
                    "anchor_was_present": False,
                    "best_sentence_original": blocks[0] if blocks else "Error extracting content",
                    "best_sentence_with_anchor": f"Error processing content. Please try adding the link manually: <a href='{target_url}'>{anchor_text}</a>",
                    "keyword_in_article": keyword_present
                })

        return results

    except Exception as e:
        print(f"Critical error in suggest_insertions: {e}")
        import traceback
        traceback.print_exc()
        return [{
            "error": f"Error processing the page: {str(e)}",
            "anchor_was_present": False,
            "best_sentence_original": "Error occurred",
            "best_sentence_with_anchor": f"Error occurred. Try manually: <a href='{target_url}'>{anchor_text}</a>",
            "keyword_in_article": False
        }]
549
  # =========================
550
  # OpenAI helpers with caching