Update app.py
Browse files
app.py
CHANGED
|
@@ -137,9 +137,8 @@ def is_likely_author_bio_or_footer(element, text):
|
|
| 137 |
return False
|
| 138 |
|
| 139 |
def get_text_blocks(url, max_paragraphs=7):
|
| 140 |
-
"""Extract text blocks
|
| 141 |
try:
|
| 142 |
-
# Try with a more complete User-Agent that mimics a real browser
|
| 143 |
headers = {
|
| 144 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
| 145 |
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
|
@@ -152,13 +151,12 @@ def get_text_blocks(url, max_paragraphs=7):
|
|
| 152 |
|
| 153 |
resp = requests.get(url, timeout=20, headers=headers, allow_redirects=True)
|
| 154 |
resp.raise_for_status()
|
| 155 |
-
# Ensure proper encoding
|
| 156 |
resp.encoding = resp.apparent_encoding
|
| 157 |
soup = BeautifulSoup(resp.text, "html.parser")
|
| 158 |
except Exception as e:
|
| 159 |
print(f"Error fetching URL {url}: {e}")
|
| 160 |
-
# Try one more time with just the basic User-Agent
|
| 161 |
try:
|
|
|
|
| 162 |
resp = requests.get(url, timeout=20, headers=UA)
|
| 163 |
resp.raise_for_status()
|
| 164 |
resp.encoding = resp.apparent_encoding
|
|
@@ -171,63 +169,88 @@ def get_text_blocks(url, max_paragraphs=7):
|
|
| 171 |
tag.decompose()
|
| 172 |
|
| 173 |
blocks = []
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
# Try to find main content area -
|
| 177 |
-
main_content =
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
if not main_content:
|
| 187 |
main_content = soup.body if soup.body else soup
|
| 188 |
|
| 189 |
-
#
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
continue
|
| 196 |
-
|
| 197 |
-
txt = " ".join(el.get_text(" ", strip=True).split())
|
| 198 |
|
| 199 |
-
# For divs,
|
| 200 |
-
if
|
| 201 |
-
|
| 202 |
-
if el.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section']):
|
| 203 |
-
continue
|
| 204 |
-
# Only include divs with substantial text content
|
| 205 |
-
if len(txt) < 100:
|
| 206 |
continue
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
#
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
return blocks
|
| 233 |
|
|
@@ -287,15 +310,6 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
|
|
| 287 |
rewritten = f'{base}{clause}{punct}'
|
| 288 |
return rewritten, False
|
| 289 |
|
| 290 |
-
def create_anchor_suggestion(anchor_text, target_url):
|
| 291 |
-
"""Create a suggestion for where to add the anchor when it's not found in the article."""
|
| 292 |
-
suggestions = [
|
| 293 |
-
f'Consider adding a new sentence like: "For more information on this topic, see <a href="{target_url}">{anchor_text}</a>."',
|
| 294 |
-
f'You could add: "Additional insights can be found in <a href="{target_url}">{anchor_text}</a>."',
|
| 295 |
-
f'Suggestion: "This relates to concepts discussed in <a href="{target_url}">{anchor_text}</a>."'
|
| 296 |
-
]
|
| 297 |
-
return suggestions[0]
|
| 298 |
-
|
| 299 |
def find_alternative_anchor(blocks, target_url, original_anchor):
|
| 300 |
"""Find a better anchor text from the article that relates to the target URL."""
|
| 301 |
try:
|
|
@@ -433,21 +447,43 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
|
|
| 433 |
if not blocks:
|
| 434 |
return [{"error":"No text blocks found on the page."}]
|
| 435 |
|
| 436 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
full_text = " ".join(blocks)
|
| 438 |
full_text_lower = full_text.lower()
|
| 439 |
anchor_text_lower = anchor_text.lower() if anchor_text else ""
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
if not keyword_present:
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
|
| 450 |
-
#
|
| 451 |
try:
|
| 452 |
tgt_html = requests.get(target_url, timeout=20, headers=UA).text
|
| 453 |
tt = BeautifulSoup(tgt_html, "html.parser").title
|
|
@@ -459,8 +495,8 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
|
|
| 459 |
ext = tldextract.extract(target_url)
|
| 460 |
tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
|
| 461 |
|
| 462 |
-
#
|
| 463 |
-
query = f"{anchor_text}
|
| 464 |
|
| 465 |
try:
|
| 466 |
q_emb = embed([query])[0]
|
|
@@ -475,7 +511,8 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
|
|
| 475 |
results = []
|
| 476 |
for idx in top_idx:
|
| 477 |
try:
|
| 478 |
-
|
|
|
|
| 479 |
|
| 480 |
# Split sentences more carefully
|
| 481 |
sents = re.split(r'(?<=[.!?])\s+', blk)
|
|
@@ -493,7 +530,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
|
|
| 493 |
if len(sents) > 0 and all(len(s) > 0 for s in sents):
|
| 494 |
s_embs = embed(sents)
|
| 495 |
s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
|
| 496 |
-
si = int(torch.argmax(s_sims).item())
|
| 497 |
if 0 <= si < len(sents):
|
| 498 |
best_sent = sents[si]
|
| 499 |
except Exception as e:
|
|
@@ -504,23 +541,23 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
|
|
| 504 |
if not best_sent or len(best_sent.strip()) == 0:
|
| 505 |
best_sent = blk if blk else "Unable to extract sentence from this section."
|
| 506 |
|
| 507 |
-
# Check if anchor is in THIS specific sentence
|
| 508 |
sentence_lower = best_sent.lower()
|
| 509 |
anchor_found_in_sentence = anchor_text_lower in sentence_lower
|
| 510 |
|
| 511 |
# If not found with exact match, try normalized
|
| 512 |
if not anchor_found_in_sentence:
|
| 513 |
-
normalized_sent = re.sub(r'
|
| 514 |
-
normalized_anchor = re.sub(r'
|
| 515 |
anchor_found_in_sentence = normalized_anchor in normalized_sent
|
| 516 |
|
| 517 |
rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
|
| 518 |
|
| 519 |
result = {
|
| 520 |
-
"anchor_was_present": anchor_found_in_sentence,
|
| 521 |
"best_sentence_original": best_sent,
|
| 522 |
"best_sentence_with_anchor": rewritten_sent,
|
| 523 |
-
"keyword_in_article": keyword_present
|
| 524 |
}
|
| 525 |
|
| 526 |
# If anchor not present in article and alternative suggestion requested
|
|
@@ -544,6 +581,8 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
|
|
| 544 |
|
| 545 |
except Exception as e:
|
| 546 |
print(f"Error processing block {idx}: {e}")
|
|
|
|
|
|
|
| 547 |
# Add a fallback result
|
| 548 |
results.append({
|
| 549 |
"anchor_was_present": False,
|
|
@@ -618,7 +657,7 @@ def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", languag
|
|
| 618 |
"(2) Do NOT use an em dash or any dash. "
|
| 619 |
'(3) Avoid phrases like "for details", "click here", "learn more", "visit", "read more". '
|
| 620 |
"Prefer integrating the anchor as part of the sentence. "
|
| 621 |
-
f"(4) Write in {language} and preserve ALL special characters (Δ,
|
| 622 |
"Return a compact JSON object with key sentence_html only."
|
| 623 |
)
|
| 624 |
|
|
@@ -711,7 +750,7 @@ def to_plain_text(html_or_text):
|
|
| 711 |
return html.unescape(text)
|
| 712 |
|
| 713 |
# =========================
|
| 714 |
-
# Gradio UI
|
| 715 |
# =========================
|
| 716 |
def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor):
|
| 717 |
if not source_url or not target_url or not anchor_text:
|
|
@@ -747,43 +786,55 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
|
|
| 747 |
|
| 748 |
# Check if anchor was already present in the article
|
| 749 |
anchor_was_present = res.get("anchor_was_present", False)
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
# If anchor is present
|
| 753 |
-
if
|
| 754 |
-
# Anchor exists
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 760 |
else:
|
| 761 |
-
# Anchor doesn't exist - need to add it
|
| 762 |
-
# 1) Optional first-pass rewrite with language support
|
| 763 |
if smart_rewrite:
|
| 764 |
g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
|
| 765 |
final_html = g["sentence_html"]
|
| 766 |
else:
|
| 767 |
final_html = draft_html
|
| 768 |
|
| 769 |
-
# 2) QA/polish pass with language support
|
| 770 |
polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
|
| 771 |
final_html = polished.get("sentence_html", final_html)
|
| 772 |
-
|
| 773 |
-
# 3) Optionally convert to plain text
|
| 774 |
final_output = to_plain_text(final_html) if plain_text else final_html
|
| 775 |
|
| 776 |
-
# Build the result for when anchor is NOT present
|
| 777 |
result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
|
| 778 |
result += f"π Result 1 - Suggested placement:\n\n"
|
| 779 |
result += f"Original: {original_sentence}\n\n"
|
| 780 |
result += f"Suggested: {final_output}"
|
| 781 |
|
| 782 |
-
#
|
| 783 |
-
|
| 784 |
-
# 2. The original anchor was NOT found in the article
|
| 785 |
-
# 3. We have an alternative suggestion
|
| 786 |
-
if suggest_alternative_anchor and not res.get("keyword_in_article", True) and res.get("alternative_anchor"):
|
| 787 |
alt_anchor = res["alternative_anchor"]
|
| 788 |
alt_sentence_original = res.get("alternative_sentence_original", "")
|
| 789 |
alt_sentence = res.get("alternative_sentence", "")
|
|
@@ -844,7 +895,7 @@ with gr.Blocks(title=f"Link Insertion Helper • GPT: {gpt_status}") as demo:
|
|
| 844 |
plain_text = gr.Checkbox(label="Plain text (no URL)", value=True)
|
| 845 |
suggest_alternative_anchor = gr.Checkbox(
|
| 846 |
label="Suggest alternative anchor",
|
| 847 |
-
value=True,
|
| 848 |
info="If anchor not found, suggest a better anchor from the article"
|
| 849 |
)
|
| 850 |
|
|
@@ -869,7 +920,7 @@ with gr.Blocks(title=f"Link Insertion Helper • GPT: {gpt_status}") as demo:
|
|
| 869 |
|
| 870 |
gr.Markdown("""
|
| 871 |
### Features:
|
| 872 |
-
- π **Auto Language Detection**: Preserves special characters (Δ,
|
| 873 |
- 💾 **Smart Caching**: Caches embeddings and API responses for faster repeated queries
|
| 874 |
- 🎯 **Main Content Focus**: Prioritizes first 5-7 paragraphs, ignores author bios
|
| 875 |
- π **Alternative Anchor Suggestion**: When your anchor isn't in the article, suggests better anchors from existing text
|
|
|
|
| 137 |
return False
|
| 138 |
|
| 139 |
def get_text_blocks(url, max_paragraphs=7):
|
| 140 |
+
"""Extract text blocks with improved extraction that captures all content."""
|
| 141 |
try:
|
|
|
|
| 142 |
headers = {
|
| 143 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
| 144 |
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
|
|
|
| 151 |
|
| 152 |
resp = requests.get(url, timeout=20, headers=headers, allow_redirects=True)
|
| 153 |
resp.raise_for_status()
|
|
|
|
| 154 |
resp.encoding = resp.apparent_encoding
|
| 155 |
soup = BeautifulSoup(resp.text, "html.parser")
|
| 156 |
except Exception as e:
|
| 157 |
print(f"Error fetching URL {url}: {e}")
|
|
|
|
| 158 |
try:
|
| 159 |
+
# Fallback to simpler headers
|
| 160 |
resp = requests.get(url, timeout=20, headers=UA)
|
| 161 |
resp.raise_for_status()
|
| 162 |
resp.encoding = resp.apparent_encoding
|
|
|
|
| 169 |
tag.decompose()
|
| 170 |
|
| 171 |
blocks = []
|
| 172 |
+
seen_texts = set()
|
| 173 |
+
|
| 174 |
+
# Try to find main content area - be more inclusive
|
| 175 |
+
main_content = None
|
| 176 |
+
content_selectors = [
|
| 177 |
+
('main', {}),
|
| 178 |
+
('article', {}),
|
| 179 |
+
('div', {'class': re.compile('content|main|article|post|entry|blog|body|wrapper', re.I)}),
|
| 180 |
+
('div', {'id': re.compile('content|main|article|post|entry|body|wrapper', re.I)}),
|
| 181 |
+
('div', {'role': 'main'}),
|
| 182 |
+
('div', {'class': 'container'}),
|
| 183 |
+
]
|
| 184 |
+
|
| 185 |
+
for tag, attrs in content_selectors:
|
| 186 |
+
if attrs:
|
| 187 |
+
main_content = soup.find(tag, attrs)
|
| 188 |
+
else:
|
| 189 |
+
main_content = soup.find(tag)
|
| 190 |
+
if main_content:
|
| 191 |
+
break
|
| 192 |
|
| 193 |
if not main_content:
|
| 194 |
main_content = soup.body if soup.body else soup
|
| 195 |
|
| 196 |
+
# Method 1: Get ALL text from the main content area first
|
| 197 |
+
# This ensures we don't miss any content
|
| 198 |
+
full_text = main_content.get_text(separator="\n")
|
| 199 |
+
|
| 200 |
+
# Split by newlines and process
|
| 201 |
+
lines = full_text.split('\n')
|
| 202 |
+
temp_blocks = []
|
| 203 |
+
|
| 204 |
+
for line in lines:
|
| 205 |
+
clean_line = " ".join(line.strip().split())
|
| 206 |
+
if len(clean_line) > 60: # Only keep substantial lines
|
| 207 |
+
if clean_line not in seen_texts:
|
| 208 |
+
temp_blocks.append(clean_line)
|
| 209 |
+
seen_texts.add(clean_line)
|
| 210 |
+
|
| 211 |
+
# Method 2: Also get specific HTML elements for better structure
|
| 212 |
+
for element in main_content.find_all(['p', 'div', 'li', 'h1', 'h2', 'h3', 'h4', 'blockquote'], recursive=True):
|
| 213 |
+
# Skip if likely author bio or footer
|
| 214 |
+
if is_likely_author_bio_or_footer(element, element.get_text()):
|
| 215 |
continue
|
|
|
|
|
|
|
| 216 |
|
| 217 |
+
# For divs, skip if they contain other block elements (they're containers)
|
| 218 |
+
if element.name == 'div':
|
| 219 |
+
if element.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'article', 'section']):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
continue
|
| 221 |
|
| 222 |
+
txt = " ".join(element.get_text(" ", strip=True).split())
|
| 223 |
+
|
| 224 |
+
# Add to blocks if substantial and not duplicate
|
| 225 |
+
if len(txt) > 60 and txt not in seen_texts:
|
| 226 |
+
blocks.append(txt)
|
| 227 |
+
seen_texts.add(txt)
|
| 228 |
+
|
| 229 |
+
# If we got blocks from method 2, use those (better structure)
|
| 230 |
+
# Otherwise, use the temp_blocks from method 1
|
| 231 |
+
if not blocks and temp_blocks:
|
| 232 |
+
blocks = temp_blocks[:max_paragraphs]
|
| 233 |
+
elif len(blocks) < max_paragraphs and temp_blocks:
|
| 234 |
+
# Combine both methods - add any unique blocks from temp_blocks
|
| 235 |
+
for tb in temp_blocks:
|
| 236 |
+
if tb not in seen_texts:
|
| 237 |
+
blocks.append(tb)
|
| 238 |
+
seen_texts.add(tb)
|
| 239 |
+
if len(blocks) >= max_paragraphs:
|
| 240 |
+
break
|
| 241 |
+
|
| 242 |
+
# Limit to max_paragraphs
|
| 243 |
+
blocks = blocks[:max_paragraphs]
|
| 244 |
+
|
| 245 |
+
# Debug output
|
| 246 |
+
print(f"\nExtracted {len(blocks)} blocks from {url}")
|
| 247 |
+
if blocks:
|
| 248 |
+
print(f"First block preview: {blocks[0][:200]}...")
|
| 249 |
+
# Check if we have reasonable content
|
| 250 |
+
full_extracted = " ".join(blocks)
|
| 251 |
+
print(f"Total extracted text length: {len(full_extracted)} chars")
|
| 252 |
+
else:
|
| 253 |
+
print("WARNING: No blocks extracted!")
|
| 254 |
|
| 255 |
return blocks
|
| 256 |
|
|
|
|
| 310 |
rewritten = f'{base}{clause}{punct}'
|
| 311 |
return rewritten, False
|
| 312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
def find_alternative_anchor(blocks, target_url, original_anchor):
|
| 314 |
"""Find a better anchor text from the article that relates to the target URL."""
|
| 315 |
try:
|
|
|
|
| 447 |
if not blocks:
|
| 448 |
return [{"error":"No text blocks found on the page."}]
|
| 449 |
|
| 450 |
+
# DEBUG: Print what we extracted
|
| 451 |
+
print("\n" + "="*50)
|
| 452 |
+
print(f"DEBUG: Looking for anchor: '{anchor_text}'")
|
| 453 |
+
print("="*50)
|
| 454 |
+
|
| 455 |
+
# Check if keyword is present in the article
|
| 456 |
full_text = " ".join(blocks)
|
| 457 |
full_text_lower = full_text.lower()
|
| 458 |
anchor_text_lower = anchor_text.lower() if anchor_text else ""
|
| 459 |
+
|
| 460 |
+
# Multiple ways to check for the anchor
|
| 461 |
+
keyword_present = False
|
| 462 |
+
|
| 463 |
+
# Method 1: Direct case-insensitive search
|
| 464 |
+
if anchor_text_lower in full_text_lower:
|
| 465 |
+
keyword_present = True
|
| 466 |
+
print(f"Found anchor via direct search")
|
| 467 |
+
|
| 468 |
+
# Method 2: Normalized search (remove extra spaces)
|
| 469 |
if not keyword_present:
|
| 470 |
+
normalized_full = re.sub(r'\s+', ' ', full_text_lower)
|
| 471 |
+
normalized_anchor = re.sub(r'\s+', ' ', anchor_text_lower)
|
| 472 |
+
if normalized_anchor in normalized_full:
|
| 473 |
+
keyword_present = True
|
| 474 |
+
print(f"Found anchor via normalized search")
|
| 475 |
+
|
| 476 |
+
# Method 3: Check each block individually
|
| 477 |
+
if not keyword_present:
|
| 478 |
+
for i, block in enumerate(blocks):
|
| 479 |
+
if anchor_text_lower in block.lower():
|
| 480 |
+
keyword_present = True
|
| 481 |
+
print(f"Found anchor in block {i}: {block[:100]}...")
|
| 482 |
+
break
|
| 483 |
+
|
| 484 |
+
print(f"Keyword present in article: {keyword_present}")
|
| 485 |
|
| 486 |
+
# Target context for similarity matching
|
| 487 |
try:
|
| 488 |
tgt_html = requests.get(target_url, timeout=20, headers=UA).text
|
| 489 |
tt = BeautifulSoup(tgt_html, "html.parser").title
|
|
|
|
| 495 |
ext = tldextract.extract(target_url)
|
| 496 |
tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
|
| 497 |
|
| 498 |
+
# Find best match with original anchor
|
| 499 |
+
query = f"{anchor_text} β relevant to: {tgt_title} ({tgt_domain})"
|
| 500 |
|
| 501 |
try:
|
| 502 |
q_emb = embed([query])[0]
|
|
|
|
| 511 |
results = []
|
| 512 |
for idx in top_idx:
|
| 513 |
try:
|
| 514 |
+
idx = min(idx, len(blocks)-1) # Ensure valid index
|
| 515 |
+
blk = blocks[idx]
|
| 516 |
|
| 517 |
# Split sentences more carefully
|
| 518 |
sents = re.split(r'(?<=[.!?])\s+', blk)
|
|
|
|
| 530 |
if len(sents) > 0 and all(len(s) > 0 for s in sents):
|
| 531 |
s_embs = embed(sents)
|
| 532 |
s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
|
| 533 |
+
si = int(torch.argmax(s_sims).item())
|
| 534 |
if 0 <= si < len(sents):
|
| 535 |
best_sent = sents[si]
|
| 536 |
except Exception as e:
|
|
|
|
| 541 |
if not best_sent or len(best_sent.strip()) == 0:
|
| 542 |
best_sent = blk if blk else "Unable to extract sentence from this section."
|
| 543 |
|
| 544 |
+
# Check if anchor is in THIS specific sentence
|
| 545 |
sentence_lower = best_sent.lower()
|
| 546 |
anchor_found_in_sentence = anchor_text_lower in sentence_lower
|
| 547 |
|
| 548 |
# If not found with exact match, try normalized
|
| 549 |
if not anchor_found_in_sentence:
|
| 550 |
+
normalized_sent = re.sub(r'\s+', ' ', sentence_lower)
|
| 551 |
+
normalized_anchor = re.sub(r'\s+', ' ', anchor_text_lower)
|
| 552 |
anchor_found_in_sentence = normalized_anchor in normalized_sent
|
| 553 |
|
| 554 |
rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
|
| 555 |
|
| 556 |
result = {
|
| 557 |
+
"anchor_was_present": anchor_found_in_sentence,
|
| 558 |
"best_sentence_original": best_sent,
|
| 559 |
"best_sentence_with_anchor": rewritten_sent,
|
| 560 |
+
"keyword_in_article": keyword_present
|
| 561 |
}
|
| 562 |
|
| 563 |
# If anchor not present in article and alternative suggestion requested
|
|
|
|
| 581 |
|
| 582 |
except Exception as e:
|
| 583 |
print(f"Error processing block {idx}: {e}")
|
| 584 |
+
import traceback
|
| 585 |
+
traceback.print_exc()
|
| 586 |
# Add a fallback result
|
| 587 |
results.append({
|
| 588 |
"anchor_was_present": False,
|
|
|
|
| 657 |
"(2) Do NOT use an em dash or any dash. "
|
| 658 |
'(3) Avoid phrases like "for details", "click here", "learn more", "visit", "read more". '
|
| 659 |
"Prefer integrating the anchor as part of the sentence. "
|
| 660 |
+
f"(4) Write in {language} and preserve ALL special characters (č, ć, š, ž, đ, etc.). "
|
| 661 |
"Return a compact JSON object with key sentence_html only."
|
| 662 |
)
|
| 663 |
|
|
|
|
| 750 |
return html.unescape(text)
|
| 751 |
|
| 752 |
# =========================
|
| 753 |
+
# Gradio UI
|
| 754 |
# =========================
|
| 755 |
def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor):
|
| 756 |
if not source_url or not target_url or not anchor_text:
|
|
|
|
| 786 |
|
| 787 |
# Check if anchor was already present in the article
|
| 788 |
anchor_was_present = res.get("anchor_was_present", False)
|
| 789 |
+
keyword_in_article = res.get("keyword_in_article", False)
|
| 790 |
+
|
| 791 |
+
# If anchor is present in the article (even if not in the best sentence)
|
| 792 |
+
if keyword_in_article:
|
| 793 |
+
# Anchor exists somewhere in article
|
| 794 |
+
if anchor_was_present:
|
| 795 |
+
# Anchor is in the suggested sentence - just show where to add the link
|
| 796 |
+
final_output = to_plain_text(draft_html) if plain_text else draft_html
|
| 797 |
+
result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
|
| 798 |
+
result += f"π Add link here:\n\n"
|
| 799 |
+
result += f"Original: {original_sentence}\n\n"
|
| 800 |
+
result += f"With link: {final_output}"
|
| 801 |
+
else:
|
| 802 |
+
# Anchor is in article but not in this sentence - show this sentence as an option
|
| 803 |
+
# and note that the anchor exists elsewhere
|
| 804 |
+
if smart_rewrite:
|
| 805 |
+
g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
|
| 806 |
+
final_html = g["sentence_html"]
|
| 807 |
+
else:
|
| 808 |
+
final_html = draft_html
|
| 809 |
+
|
| 810 |
+
polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
|
| 811 |
+
final_html = polished.get("sentence_html", final_html)
|
| 812 |
+
final_output = to_plain_text(final_html) if plain_text else final_html
|
| 813 |
+
|
| 814 |
+
result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
|
| 815 |
+
result += f"π The anchor appears elsewhere in the article. Here's a contextually relevant placement:\n\n"
|
| 816 |
+
result += f"Original: {original_sentence}\n\n"
|
| 817 |
+
result += f"Suggested: {final_output}\n\n"
|
| 818 |
+
result += f"💡 Note: You may want to search for '{anchor_text}' in the article to find where it naturally appears."
|
| 819 |
else:
|
| 820 |
+
# Anchor doesn't exist in article at all - need to add it
|
|
|
|
| 821 |
if smart_rewrite:
|
| 822 |
g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
|
| 823 |
final_html = g["sentence_html"]
|
| 824 |
else:
|
| 825 |
final_html = draft_html
|
| 826 |
|
|
|
|
| 827 |
polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
|
| 828 |
final_html = polished.get("sentence_html", final_html)
|
|
|
|
|
|
|
| 829 |
final_output = to_plain_text(final_html) if plain_text else final_html
|
| 830 |
|
|
|
|
| 831 |
result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
|
| 832 |
result += f"π Result 1 - Suggested placement:\n\n"
|
| 833 |
result += f"Original: {original_sentence}\n\n"
|
| 834 |
result += f"Suggested: {final_output}"
|
| 835 |
|
| 836 |
+
# Show alternative if requested and available
|
| 837 |
+
if suggest_alternative_anchor and res.get("alternative_anchor"):
|
|
|
|
|
|
|
|
|
|
| 838 |
alt_anchor = res["alternative_anchor"]
|
| 839 |
alt_sentence_original = res.get("alternative_sentence_original", "")
|
| 840 |
alt_sentence = res.get("alternative_sentence", "")
|
|
|
|
| 895 |
plain_text = gr.Checkbox(label="Plain text (no URL)", value=True)
|
| 896 |
suggest_alternative_anchor = gr.Checkbox(
|
| 897 |
label="Suggest alternative anchor",
|
| 898 |
+
value=True,
|
| 899 |
info="If anchor not found, suggest a better anchor from the article"
|
| 900 |
)
|
| 901 |
|
|
|
|
| 920 |
|
| 921 |
gr.Markdown("""
|
| 922 |
### Features:
|
| 923 |
+
- π **Auto Language Detection**: Preserves special characters (č, ć, š, ž, đ, etc.)
|
| 924 |
- 💾 **Smart Caching**: Caches embeddings and API responses for faster repeated queries
|
| 925 |
- 🎯 **Main Content Focus**: Prioritizes first 5-7 paragraphs, ignores author bios
|
| 926 |
- π **Alternative Anchor Suggestion**: When your anchor isn't in the article, suggests better anchors from existing text
|