Update app.py
Browse files
app.py
CHANGED
|
@@ -298,191 +298,253 @@ def create_anchor_suggestion(anchor_text, target_url):
|
|
| 298 |
|
| 299 |
def find_alternative_anchor(blocks, target_url, original_anchor):
|
| 300 |
"""Find a better anchor text from the article that relates to the target URL."""
|
| 301 |
-
|
| 302 |
-
# Get target page context
|
| 303 |
try:
|
| 304 |
-
|
| 305 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
-
#
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
|
| 314 |
-
# Extract
|
| 315 |
-
|
| 316 |
-
for
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
title = ""
|
| 325 |
-
meta_desc = ""
|
| 326 |
-
target_content = original_anchor
|
| 327 |
-
|
| 328 |
-
# Extract all potential anchor phrases from the source article
|
| 329 |
-
all_phrases = set()
|
| 330 |
-
full_text = " ".join(blocks)
|
| 331 |
-
|
| 332 |
-
# Common words to exclude
|
| 333 |
-
stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
|
| 334 |
-
'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
|
| 335 |
-
'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
|
| 336 |
-
'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
|
| 337 |
-
'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što'}
|
| 338 |
-
|
| 339 |
-
# Extract noun phrases and important terms (2-4 words)
|
| 340 |
-
sentences = re.split(r'[.!?]', full_text)
|
| 341 |
-
for sentence in sentences:
|
| 342 |
-
words = sentence.split()
|
| 343 |
|
| 344 |
-
#
|
| 345 |
-
|
| 346 |
-
for i in range(len(words) - length + 1):
|
| 347 |
-
phrase = ' '.join(words[i:i+length])
|
| 348 |
-
phrase_clean = phrase.strip('.,!?;:"\' ')
|
| 349 |
-
|
| 350 |
-
# Check if phrase is meaningful
|
| 351 |
-
first_word = words[i].lower().strip('.,!?;:')
|
| 352 |
-
last_word = words[i+length-1].lower().strip('.,!?;:')
|
| 353 |
-
|
| 354 |
-
# Skip if starts/ends with stopwords or is too short
|
| 355 |
-
if (first_word not in stopwords and
|
| 356 |
-
last_word not in stopwords and
|
| 357 |
-
len(phrase_clean) > 5 and
|
| 358 |
-
len(phrase_clean) < 50):
|
| 359 |
-
all_phrases.add(phrase_clean)
|
| 360 |
|
| 361 |
-
#
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
all_phrases.add(clean_word)
|
| 367 |
-
|
| 368 |
-
if not all_phrases:
|
| 369 |
-
return None, None
|
| 370 |
-
|
| 371 |
-
# Create context query from target URL info
|
| 372 |
-
target_context = f"{title} {meta_desc} {target_content}"[:500]
|
| 373 |
-
|
| 374 |
-
# Score each phrase based on relevance to target
|
| 375 |
-
target_emb = embed([target_context])[0]
|
| 376 |
-
|
| 377 |
-
best_anchor = None
|
| 378 |
-
best_score = -1
|
| 379 |
-
best_sentence = None
|
| 380 |
-
|
| 381 |
-
# Evaluate each potential anchor
|
| 382 |
-
for phrase in all_phrases:
|
| 383 |
-
# Skip if too similar to original anchor (we want something different)
|
| 384 |
-
if phrase.lower() == original_anchor.lower():
|
| 385 |
-
continue
|
| 386 |
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
|
| 391 |
-
#
|
| 392 |
-
|
| 393 |
-
#
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
|
| 411 |
def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
|
|
|
| 415 |
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
|
| 420 |
-
|
| 421 |
-
try:
|
| 422 |
-
tgt_html = requests.get(target_url, timeout=20, headers=UA).text
|
| 423 |
-
tt = BeautifulSoup(tgt_html, "html.parser").title
|
| 424 |
-
tgt_title = tt.get_text().strip() if tt else ""
|
| 425 |
-
except Exception:
|
| 426 |
-
tgt_title = ""
|
| 427 |
-
|
| 428 |
-
ext = tldextract.extract(target_url)
|
| 429 |
-
tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
|
| 430 |
-
|
| 431 |
-
# First, find best match with original anchor
|
| 432 |
-
query = f"{anchor_text} – relevant to: {tgt_title} ({tgt_domain})"
|
| 433 |
-
q_emb = embed([query])[0]
|
| 434 |
-
|
| 435 |
-
blk_embs = embed(blocks)
|
| 436 |
-
sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks),1))
|
| 437 |
-
top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()
|
| 438 |
-
|
| 439 |
-
results = []
|
| 440 |
-
for idx in top_idx:
|
| 441 |
-
blk = blocks[idx]
|
| 442 |
-
# Split sentences more carefully
|
| 443 |
-
sents = re.split(r'(?<=[.!?])\s+', blk)
|
| 444 |
-
# Filter out empty sentences
|
| 445 |
-
sents = [s for s in sents if s and len(s.strip()) > 0]
|
| 446 |
-
|
| 447 |
-
if not sents:
|
| 448 |
-
# If no valid sentences, use the whole block
|
| 449 |
-
sents = [blk]
|
| 450 |
-
|
| 451 |
try:
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
best_sent = sents[min(si, len(sents)-1)] # Ensure index is valid
|
| 456 |
except Exception as e:
|
| 457 |
-
print(f"Error
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
|
|
|
|
|
|
|
|
|
| 462 |
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
|
| 470 |
-
|
| 471 |
-
if suggest_alternative and not keyword_present:
|
| 472 |
-
# Find a completely different anchor and sentence
|
| 473 |
-
alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)
|
| 474 |
-
|
| 475 |
-
if alt_anchor and alt_sentence:
|
| 476 |
-
# Create the sentence with the alternative anchor
|
| 477 |
-
alt_rewritten, alt_exact = inject_anchor_into_sentence(alt_sentence, alt_anchor, target_url)
|
| 478 |
-
result["alternative_anchor"] = alt_anchor
|
| 479 |
-
result["alternative_sentence_original"] = alt_sentence
|
| 480 |
-
result["alternative_sentence"] = alt_rewritten
|
| 481 |
-
result["alternative_exact_match"] = alt_exact
|
| 482 |
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
|
| 487 |
# =========================
|
| 488 |
# OpenAI helpers with caching
|
|
|
|
| 298 |
|
| 299 |
# Words that may not start or end a multi-word candidate phrase: English
# function words plus what appear to be South Slavic ones (kept verbatim).
_ANCHOR_STOPWORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
    'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što',
})

# Characters trimmed from both ends of a candidate phrase / single word.
_ANCHOR_EDGE_CHARS = '.,!?;:"\' '

# Upper bound on how many candidates are embedded and scored per call.
_MAX_ANCHOR_CANDIDATES = 50


def _fetch_anchor_target_context(target_url, fallback_text):
    """Return ``(title, meta_description, lead_text)`` for the target page.

    Best effort: on any failure (network, parsing, ...) the error is printed
    and ``("", "", fallback_text)`` is returned instead of raising.
    """
    try:
        tgt_html = requests.get(target_url, timeout=20, headers=UA).text
        soup = BeautifulSoup(tgt_html, "html.parser")

        title = soup.title.get_text().strip() if soup.title else ""

        meta_tag = soup.find("meta", attrs={"name": "description"})
        meta_desc = (meta_tag.get("content") or "") if meta_tag else ""

        # Only the first five <p> elements are inspected; of those, up to
        # three with more than 50 characters of text are kept.
        lead_texts = [p.get_text().strip() for p in soup.find_all("p")[:5]]
        target_content = " ".join([t for t in lead_texts if len(t) > 50][:3])
        return title, meta_desc, target_content
    except Exception as e:
        print(f"Error fetching target URL: {e}")
        return "", "", fallback_text


def _collect_anchor_candidates(full_text):
    """Return candidate anchors found in *full_text*, in first-seen order.

    Candidates are 2-4 word windows that neither start nor end with a
    stopword and are 6-49 characters long after trimming, plus single words
    that are longer than six characters or start with an uppercase letter
    (and are not stopwords). Duplicates are dropped.
    """
    # A dict is used as an insertion-ordered set so the result order does not
    # depend on string hashing.
    candidates = {}
    for sentence in re.split(r'[.!?]', full_text):
        words = sentence.split()

        for length in range(2, min(5, len(words) + 1)):
            for i in range(len(words) - length + 1):
                window = words[i:i + length]
                first_word = window[0].lower().strip('.,!?;:')
                last_word = window[-1].lower().strip('.,!?;:')
                if first_word in _ANCHOR_STOPWORDS or last_word in _ANCHOR_STOPWORDS:
                    continue
                phrase_clean = ' '.join(window).strip(_ANCHOR_EDGE_CHARS)
                if 5 < len(phrase_clean) < 50:
                    candidates[phrase_clean] = None

        for word in words:
            clean_word = word.strip(_ANCHOR_EDGE_CHARS)
            if not clean_word:
                continue
            if len(clean_word) > 6 or (
                    clean_word[0].isupper()
                    and clean_word.lower() not in _ANCHOR_STOPWORDS):
                candidates[clean_word] = None

    return list(candidates)


def find_alternative_anchor(blocks, target_url, original_anchor):
    """Find a better anchor text from the article that relates to the target URL.

    Args:
        blocks: Text blocks (strings) of the source article.
        target_url: URL of the page the anchor should link to. It is fetched
            to build a context string (title, meta description, lead text).
        original_anchor: The anchor the caller already tried; a candidate
            equal to it (case-insensitively) is never suggested. ``None`` is
            treated as an empty string.

    Returns:
        ``(anchor, sentence)`` where *sentence* is the article sentence that
        contains *anchor* and scored best, or ``(None, None)`` when no
        candidate could be found or scored. The function never raises;
        unexpected errors are printed and reported as ``(None, None)``.

    Notes:
        Scoring is ``0.6 * cos(phrase, target) + 0.4 * cos(sentence, target)``
        using the module's ``embed`` helper (assumed to return one embedding
        row per input text - it is indexed with ``[0]`` here).
    """
    try:
        original_anchor = original_anchor or ""
        title, meta_desc, target_content = _fetch_anchor_target_context(
            target_url, original_anchor)

        full_text = " ".join(blocks)
        skip = original_anchor.lower()
        candidates = [
            phrase for phrase in _collect_anchor_candidates(full_text)
            if phrase.lower() != skip
        ]
        if not candidates:
            return None, None

        # Create context query from target URL info
        target_context = f"{title} {meta_desc} {target_content}"[:500]

        try:
            target_emb = embed([target_context])[0]
        except Exception as e:
            print(f"Error embedding target context: {e}")
            return None, None
        target_row = target_emb.unsqueeze(0)

        # Cap the work deterministically: take an evenly spaced sample of the
        # first-seen order so the shortlist spans the whole article and does
        # not change between runs (slicing a set of strings would).
        stride = max(1, (len(candidates) + _MAX_ANCHOR_CANDIDATES - 1)
                     // _MAX_ANCHOR_CANDIDATES)
        shortlist = candidates[::stride]

        # Lower-casing and sentence splitting are loop-invariant per block.
        split_blocks = [
            (block.lower(), re.split(r'(?<=[.!?])\s+', block))
            for block in blocks
        ]
        # sentence -> similarity to the target (None if embedding failed), so
        # a sentence shared by several phrases is embedded only once.
        sentence_scores = {}

        best_anchor = None
        best_score = -1
        best_sentence = None

        for phrase in shortlist:
            needle = phrase.lower()
            try:
                phrase_emb = embed([phrase])[0]
                relevance_score = F.cosine_similarity(
                    phrase_emb.unsqueeze(0), target_row).item()
            except Exception as e:
                print(f"Error evaluating phrase '{phrase}': {e}")
                continue

            for block_lower, sents in split_blocks:
                if needle not in block_lower:
                    continue
                for sent in sents:
                    if not sent or needle not in sent.lower():
                        continue

                    if sent not in sentence_scores:
                        try:
                            sent_emb = embed([sent])[0]
                            sentence_scores[sent] = F.cosine_similarity(
                                sent_emb.unsqueeze(0), target_row).item()
                        except Exception as e:
                            print(f"Error embedding sentence: {e}")
                            sentence_scores[sent] = None
                    context_score = sentence_scores[sent]
                    if context_score is None:
                        continue

                    combined_score = (relevance_score * 0.6) + (context_score * 0.4)
                    if combined_score > best_score:
                        best_score = combined_score
                        best_anchor = phrase
                        best_sentence = sent

        return best_anchor, best_sentence

    except Exception as e:
        # Last-resort boundary: callers treat (None, None) as "no suggestion".
        print(f"Critical error in find_alternative_anchor: {e}")
        return None, None
|
| 429 |
|
| 430 |
def _escaped_manual_link(target_url, anchor_text):
    """Return ``<a href='...'>...</a>`` with both parts HTML-escaped."""
    import html

    href = html.escape(str(target_url), quote=True)
    label = html.escape(str(anchor_text), quote=False)
    return f"<a href='{href}'>{label}</a>"


def _alternative_anchor_fields(blocks, target_url, anchor_text):
    """Return the ``alternative_*`` result fields, or ``{}`` if none was found."""
    try:
        alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)
        if not (alt_anchor and alt_sentence):
            return {}
        alt_rewritten, alt_exact = inject_anchor_into_sentence(alt_sentence, alt_anchor, target_url)
        return {
            "alternative_anchor": alt_anchor,
            "alternative_sentence_original": alt_sentence,
            "alternative_sentence": alt_rewritten,
            "alternative_exact_match": alt_exact,
        }
    except Exception as e:
        print(f"Error finding alternative anchor: {e}")
        return {}


def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
    """Suggest where in the source article a link to *target_url* fits best.

    The article's text blocks are ranked against a query built from the
    anchor text, the target page title and the target domain; for each of the
    ``top_k`` best blocks the most similar sentence is picked and passed to
    ``inject_anchor_into_sentence``.

    Args:
        source_url: URL of the article to place the link in.
        target_url: URL the link should point to.
        anchor_text: Desired anchor text (may be empty/``None``).
        top_k: Number of blocks to produce suggestions for.
        suggest_alternative: When true and *anchor_text* does not occur in the
            article, also look for a different anchor via
            ``find_alternative_anchor`` and add ``alternative_*`` fields.

    Returns:
        A list of dicts. Normal entries carry ``anchor_was_present``,
        ``best_sentence_original``, ``best_sentence_with_anchor`` and
        ``keyword_in_article`` (plus optional ``alternative_*`` keys). When the
        page has no text blocks the list is ``[{"error": ...}]``; on an
        unexpected failure a single entry with an ``error`` key and
        placeholder values is returned. The function does not raise.
    """
    try:
        blocks = get_text_blocks(source_url)
        if not blocks:
            return [{"error":"No text blocks found on the page."}]

        # Check if keyword is present in the article
        full_text = " ".join(blocks).lower()
        keyword_present = anchor_text.lower() in full_text if anchor_text else False

        # Target context: only the page title is used; failure is non-fatal.
        try:
            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
            tt = BeautifulSoup(tgt_html, "html.parser").title
            tgt_title = tt.get_text().strip() if tt else ""
        except Exception as e:
            print(f"Error fetching target URL: {e}")
            tgt_title = ""

        ext = tldextract.extract(target_url)
        tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])

        # First, find best match with original anchor
        query = f"{anchor_text} – relevant to: {tgt_title} ({tgt_domain})"

        # q_emb stays None when the query could not be embedded, so the
        # sentence-ranking step below can skip itself explicitly.
        q_emb = None
        try:
            q_emb = embed([query])[0]
            blk_embs = embed(blocks)
            sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks), 1))
            top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()
        except Exception as e:
            print(f"Error in block embedding/similarity: {e}")
            # Fallback to first block
            top_idx = [0]

        want_alternative = bool(suggest_alternative and not keyword_present)
        # The alternative depends only on (blocks, target_url, anchor_text),
        # so it is computed at most once and shared by every result.
        alternative = None

        results = []
        for idx in top_idx:
            try:
                blk = blocks[idx]

                sents = re.split(r'(?<=[.!?])\s+', blk)
                # Drop fragments of 10 characters or fewer.
                sents = [s.strip() for s in sents if s and len(s.strip()) > 10]
                if not sents:
                    # If no valid sentences, use the whole block
                    sents = [blk]

                best_sent = sents[0]  # Default to first sentence
                if q_emb is not None and len(sents) > 1:
                    try:
                        s_embs = embed(sents)
                        s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents), 1))
                        best_sent = sents[int(torch.argmax(s_sims).item())]
                    except Exception as e:
                        print(f"Error in sentence selection: {e}, using first sentence")

                # Ensure best_sent is valid before processing
                if not best_sent or not best_sent.strip():
                    best_sent = blk if blk else "Unable to extract sentence from this section."

                rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)

                result = {
                    "anchor_was_present": exact_found,
                    "best_sentence_original": best_sent,
                    "best_sentence_with_anchor": rewritten_sent,
                    "keyword_in_article": keyword_present,
                }

                if want_alternative:
                    if alternative is None:
                        alternative = _alternative_anchor_fields(blocks, target_url, anchor_text)
                    result.update(alternative)

                results.append(result)

            except Exception as e:
                print(f"Error processing block {idx}: {e}")
                # Add a fallback result
                results.append({
                    "anchor_was_present": False,
                    "best_sentence_original": blocks[0] if blocks else "Error extracting content",
                    "best_sentence_with_anchor": (
                        "Error processing content. Please try adding the link manually: "
                        + _escaped_manual_link(target_url, anchor_text)
                    ),
                    "keyword_in_article": keyword_present,
                })

        return results

    except Exception as e:
        # Top-level boundary: report the failure as a result entry instead of raising.
        print(f"Critical error in suggest_insertions: {e}")
        import traceback
        traceback.print_exc()
        return [{
            "error": f"Error processing the page: {str(e)}",
            "anchor_was_present": False,
            "best_sentence_original": "Error occurred",
            "best_sentence_with_anchor": (
                "Error occurred. Try manually: "
                + _escaped_manual_link(target_url, anchor_text)
            ),
            "keyword_in_article": False,
        }]
|
| 548 |
|
| 549 |
# =========================
|
| 550 |
# OpenAI helpers with caching
|