dusan-presswhizz committed on
Commit
b9459ca
·
verified ·
1 Parent(s): d3ac3e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -50
app.py CHANGED
@@ -261,35 +261,13 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
261
  rewritten = f'{base}{clause}{punct}'
262
  return rewritten, False
263
 
264
- def find_alternative_anchor(blocks, target_url, original_anchor):
265
  """Find a better anchor text from the article that relates to the target URL."""
266
  try:
267
- # Get target page context
268
- try:
269
- tgt_html = requests.get(target_url, timeout=20, headers=UA).text
270
- soup = BeautifulSoup(tgt_html, "html.parser")
271
-
272
- # Extract target page title and meta description
273
- title = soup.title.get_text().strip() if soup.title else ""
274
- meta_desc = ""
275
- meta_tag = soup.find("meta", attrs={"name": "description"})
276
- if meta_tag:
277
- meta_desc = meta_tag.get("content", "")
278
-
279
- # Extract key terms from target page (first few paragraphs)
280
- target_paragraphs = []
281
- for p in soup.find_all("p")[:5]:
282
- text = p.get_text().strip()
283
- if len(text) > 50:
284
- target_paragraphs.append(text)
285
- target_content = " ".join(target_paragraphs[:3])
286
-
287
- except Exception as e:
288
- print(f"Error fetching target URL: {e}")
289
- title = ""
290
- meta_desc = ""
291
- target_content = original_anchor
292
-
293
  # Extract all potential anchor phrases from the source article
294
  all_phrases = set()
295
  full_text = " ".join(blocks)
@@ -314,7 +292,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
314
  if i < 0 or i+length > len(words):
315
  continue
316
  phrase = ' '.join(words[i:i+length])
317
- phrase_clean = phrase.strip('.,!?;:"\' ')
318
 
319
  # Check if phrase is meaningful
320
  if i < len(words) and i+length-1 < len(words):
@@ -330,7 +308,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
330
 
331
  # Also extract single important words (proper nouns, long words)
332
  for word in words:
333
- clean_word = word.strip('.,!?;:"\' ')
334
  if clean_word and (len(clean_word) > 6 or
335
  (len(clean_word) > 0 and clean_word[0].isupper() and clean_word.lower() not in stopwords)):
336
  all_phrases.add(clean_word)
@@ -338,12 +316,12 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
338
  if not all_phrases:
339
  return None, None
340
 
341
- # Create context query from target URL info
342
- target_context = f"{title} {meta_desc} {target_content}"[:500]
343
 
344
  # Score each phrase based on relevance to target
345
  try:
346
- target_emb = embed([target_context])[0]
347
  except:
348
  return None, None
349
 
@@ -362,6 +340,10 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
362
  phrase_emb = embed([phrase])[0]
363
  relevance_score = F.cosine_similarity(phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
364
 
 
 
 
 
365
  # Check if this phrase appears in article and find its best context
366
  if phrase.lower() in full_text.lower():
367
  # Find sentences containing this phrase
@@ -376,22 +358,146 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
376
  context_score = F.cosine_similarity(sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
377
  combined_score = (relevance_score * 0.6) + (context_score * 0.4)
378
 
379
- if combined_score > best_score:
 
380
  best_score = combined_score
381
  best_anchor = phrase
382
  best_sentence = sent
 
383
  except:
384
  continue
385
  except Exception as e:
386
  print(f"Error evaluating phrase '{phrase}': {e}")
387
  continue
388
 
 
 
 
 
 
389
  return best_anchor, best_sentence
390
 
391
  except Exception as e:
392
  print(f"Critical error in find_alternative_anchor: {e}")
393
  return None, None
394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
396
  try:
397
  blocks = get_text_blocks(source_url)
@@ -403,6 +509,21 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
403
  print(f"DEBUG: Looking for anchor: '{anchor_text}'")
404
  print("="*50)
405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  # Check if keyword is present in the article
407
  full_text = " ".join(blocks)
408
  full_text_lower = full_text.lower()
@@ -434,20 +555,11 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
434
 
435
  print(f"Keyword present in article: {keyword_present}")
436
 
437
- # Target context for similarity matching
438
- try:
439
- tgt_html = requests.get(target_url, timeout=20, headers=UA).text
440
- tt = BeautifulSoup(tgt_html, "html.parser").title
441
- tgt_title = tt.get_text().strip() if tt else ""
442
- except Exception as e:
443
- print(f"Error fetching target URL: {e}")
444
- tgt_title = ""
445
-
446
  ext = tldextract.extract(target_url)
447
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
448
 
449
- # Find best match with original anchor
450
- query = f"{anchor_text} โ€” relevant to: {tgt_title} ({tgt_domain})"
451
 
452
  try:
453
  q_emb = embed([query])[0]
@@ -504,18 +616,31 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
504
 
505
  rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
506
 
 
 
 
 
 
 
 
 
507
  result = {
508
  "anchor_was_present": anchor_found_in_sentence,
509
  "best_sentence_original": best_sent,
510
  "best_sentence_with_anchor": rewritten_sent,
511
- "keyword_in_article": keyword_present
 
 
 
 
512
  }
513
 
514
  # If anchor not present in article and alternative suggestion requested
515
  if suggest_alternative and not keyword_present:
516
  try:
517
  # Find a completely different anchor and sentence
518
- alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)
 
519
 
520
  if alt_anchor and alt_sentence:
521
  # Create the sentence with the alternative anchor
@@ -738,6 +863,17 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
738
  # Check if anchor was already present in the article
739
  anchor_was_present = res.get("anchor_was_present", False)
740
  keyword_in_article = res.get("keyword_in_article", False)
 
 
 
 
 
 
 
 
 
 
 
741
 
742
  # If anchor is present in the article (even if not in the best sentence)
743
  if keyword_in_article:
@@ -746,8 +882,10 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
746
  # Anchor is in the suggested sentence - just show where to add the link
747
  final_output = to_plain_text(draft_html) if plain_text else draft_html
748
  result = warn + f"โœ… **Anchor text '{anchor_text}' found in article!**\n\n"
749
- result += f"๐Ÿ“ Add link here:\n\n"
750
  result += f"{final_output}"
 
 
751
  else:
752
  # Anchor is in article but not in this sentence
753
  if smart_rewrite:
@@ -761,7 +899,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
761
  final_output = to_plain_text(final_html) if plain_text else final_html
762
 
763
  result = warn + f"โœ… **Anchor text '{anchor_text}' found in article!**\n\n"
764
- result += f"๐Ÿ“ Add link here:\n\n"
765
  result += f"{final_output}"
766
  else:
767
  # Anchor doesn't exist in article at all - need to add it
@@ -776,7 +914,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
776
  final_output = to_plain_text(final_html) if plain_text else final_html
777
 
778
  result = warn + f"โš ๏ธ **Anchor text '{anchor_text}' not found in article**\n\n"
779
- result += f"๐Ÿ“ Result 1 - Suggested placement:\n\n"
780
  result += f"Original: {original_sentence}\n\n"
781
  result += f"Suggested: {final_output}"
782
 
@@ -807,7 +945,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
807
 
808
  # Add alternative as Result 2
809
  result += f"\n\n{'='*50}\n\n"
810
- result += f"๐Ÿ“ Result 2 - Alternative from article:\n"
811
  result += f"๐Ÿ’ก Alternative anchor: '{alt_anchor}'\n\n"
812
  result += f"Original: {alt_sentence_original}\n\n"
813
  result += f"Suggested: {alt_output}"
 
261
  rewritten = f'{base}{clause}{punct}'
262
  return rewritten, False
263
 
264
+ def find_alternative_anchor(blocks, target_url, original_anchor, target_context=None):
265
  """Find a better anchor text from the article that relates to the target URL."""
266
  try:
267
+ # Use provided target context or analyze the URL
268
+ if not target_context:
269
+ target_context = analyze_target_url(target_url)
270
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  # Extract all potential anchor phrases from the source article
272
  all_phrases = set()
273
  full_text = " ".join(blocks)
 
292
  if i < 0 or i+length > len(words):
293
  continue
294
  phrase = ' '.join(words[i:i+length])
295
+ phrase_clean = phrase.strip('.,!?;:"\'')
296
 
297
  # Check if phrase is meaningful
298
  if i < len(words) and i+length-1 < len(words):
 
308
 
309
  # Also extract single important words (proper nouns, long words)
310
  for word in words:
311
+ clean_word = word.strip('.,!?;:"\'')
312
  if clean_word and (len(clean_word) > 6 or
313
  (len(clean_word) > 0 and clean_word[0].isupper() and clean_word.lower() not in stopwords)):
314
  all_phrases.add(clean_word)
 
316
  if not all_phrases:
317
  return None, None
318
 
319
+ # Use the comprehensive target context
320
+ target_summary = target_context.get("summary", "")[:500]
321
 
322
  # Score each phrase based on relevance to target
323
  try:
324
+ target_emb = embed([target_summary])[0]
325
  except:
326
  return None, None
327
 
 
340
  phrase_emb = embed([phrase])[0]
341
  relevance_score = F.cosine_similarity(phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
342
 
343
+ # Only consider phrases with good relevance to target (threshold)
344
+ if relevance_score < 0.3: # Skip low relevance phrases
345
+ continue
346
+
347
  # Check if this phrase appears in article and find its best context
348
  if phrase.lower() in full_text.lower():
349
  # Find sentences containing this phrase
 
358
  context_score = F.cosine_similarity(sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
359
  combined_score = (relevance_score * 0.6) + (context_score * 0.4)
360
 
361
+ # Higher threshold for alternative anchors
362
+ if combined_score > best_score and combined_score > 0.35:
363
  best_score = combined_score
364
  best_anchor = phrase
365
  best_sentence = sent
366
+ print(f" Found alternative: '{phrase}' (score: {combined_score:.3f})")
367
  except:
368
  continue
369
  except Exception as e:
370
  print(f"Error evaluating phrase '{phrase}': {e}")
371
  continue
372
 
373
+ if best_anchor:
374
+ print(f"\nโœ“ Best alternative anchor: '{best_anchor}' (relevance: {best_score:.3f})")
375
+ else:
376
+ print(f"\nโœ— No suitable alternative anchor found with sufficient relevance to target page")
377
+
378
  return best_anchor, best_sentence
379
 
380
  except Exception as e:
381
  print(f"Critical error in find_alternative_anchor: {e}")
382
  return None, None
383
 
384
def analyze_target_url(target_url, fallback_summary=""):
    """Fetch the target page and build a context dict describing its topic.

    Main text is extracted with Trafilatura when possible; otherwise the page
    is fetched with ``requests`` and its visible text is taken via
    BeautifulSoup after dropping script/navigation-style tags. Title, meta
    description and the first h1-h3 headings are read from the same HTML.

    Args:
        target_url: URL of the page to analyze.
        fallback_summary: Text to use as ``"summary"`` when the page cannot
            be analyzed (for example the anchor text). Defaults to ``""``.

    Returns:
        A dict with the keys ``"title"``, ``"meta_description"``,
        ``"headings"`` (list of str), ``"main_content"`` (first 1500 chars of
        text), ``"full_text"`` (first 3000 chars) and ``"summary"``. On any
        error the string fields are empty, ``"headings"`` is ``[]`` and
        ``"summary"`` is ``fallback_summary``.
    """
    try:
        # Try Trafilatura first for better extraction
        downloaded = trafilatura.fetch_url(target_url)
        target_text = None
        if downloaded:
            # Only run the extractor when the download produced something.
            target_text = trafilatura.extract(
                downloaded,
                include_comments=False,
                include_tables=False,
                deduplicate=True,
                output_format='txt',
                favor_precision=False,
            )

        # HTML used for metadata. The original referenced an unbound
        # `tgt_html` here whenever Trafilatura succeeded.
        page_html = downloaded

        if not target_text:
            # Fallback to BeautifulSoup; this single fetch is reused for the
            # metadata below instead of requesting the page a second time.
            page_html = requests.get(target_url, timeout=20, headers=UA).text
            text_soup = BeautifulSoup(page_html, "html.parser")

            # Remove unwanted elements
            for tag in text_soup(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
                tag.decompose()

            # Get text content
            target_text = text_soup.get_text(separator=" ", strip=True) if text_soup.body else ""

        # Metadata is parsed from an untouched soup so that headings inside
        # <header> (removed above for text extraction) are still visible.
        soup = BeautifulSoup(page_html, "html.parser")

        # Extract title
        title = soup.title.get_text().strip() if soup.title else ""

        # Extract meta description
        meta_desc = ""
        meta_tag = soup.find("meta", attrs={"name": "description"})
        if meta_tag:
            meta_desc = meta_tag.get("content", "")

        # Extract h1-h3 headings for topic understanding (first 10 tags only)
        headings = []
        for h in soup.find_all(['h1', 'h2', 'h3'])[:10]:
            heading_text = h.get_text().strip()
            if heading_text:
                headings.append(heading_text)

        # Take first 1500 chars of main content for context
        main_content = target_text[:1500] if target_text else ""

        target_context = {
            "title": title,
            "meta_description": meta_desc,
            "headings": headings,
            "main_content": main_content,
            "full_text": target_text[:3000] if target_text else "",  # Limit for embedding
            "summary": f"{title} {meta_desc} {' '.join(headings[:5])} {main_content[:500]}",
        }

        print(f"\nTarget URL Analysis:")
        print(f" Title: {title[:100]}")
        print(f" Meta: {meta_desc[:100]}")
        print(f" Main headings: {headings[:3]}")
        print(f" Content preview: {main_content[:200]}...")

        return target_context

    except Exception as e:
        # Best-effort boundary: network/parsing problems must not abort the
        # caller, so report and return an empty context.
        print(f"Error analyzing target URL: {e}")
        return {
            "title": "",
            "meta_description": "",
            "headings": [],
            "main_content": "",
            "full_text": "",
            # The original used the undefined name `anchor_text` here, which
            # turned every failure into a NameError.
            "summary": fallback_summary,
        }
459
+
460
def validate_anchor_relevance(anchor_text, sentence, target_context, threshold=0.3):
    """Check if the anchor and sentence are relevant to the target page content.

    The anchor text and the sentence are each embedded and compared, by
    cosine similarity, against the embedding of ``target_context["summary"]``.

    Args:
        anchor_text: Anchor text to score.
        sentence: Sentence to score.
        target_context: Mapping with a ``"summary"`` string describing the
            target page.
        threshold: Similarity above which the anchor or the sentence counts
            as relevant.

    Returns:
        A tuple ``(is_relevant, anchor_relevance, sentence_relevance)``.
        ``is_relevant`` is True when either score exceeds ``threshold``.
        When the summary is empty or scoring fails, ``(True, 0.5, 0.5)`` is
        returned so the check does not block the caller.
    """
    try:
        # Create embedding for target page context
        target_summary = target_context.get("summary", "")
        if not target_summary or not target_summary.strip():
            # If we can't analyze, assume it's ok. The original returned a
            # bare True here; callers unpack three values, so return the same
            # neutral tuple as the error path.
            return True, 0.5, 0.5

        # Embed target content
        target_emb = embed([target_summary])[0]

        # Check anchor relevance to target
        anchor_emb = embed([anchor_text])[0]
        anchor_relevance = F.cosine_similarity(
            anchor_emb.unsqueeze(0),
            target_emb.unsqueeze(0)
        ).item()

        # Check sentence relevance to target
        sentence_emb = embed([sentence])[0]
        sentence_relevance = F.cosine_similarity(
            sentence_emb.unsqueeze(0),
            target_emb.unsqueeze(0)
        ).item()

        print(f"\nRelevance scores:")
        print(f" Anchor '{anchor_text}' to target: {anchor_relevance:.3f}")
        print(f" Sentence to target: {sentence_relevance:.3f}")

        # Return true if either anchor or sentence is relevant enough
        is_relevant = anchor_relevance > threshold or sentence_relevance > threshold

        if not is_relevant:
            print(f" โš ๏ธ Low relevance detected! Anchor/sentence may not match target page topic.")

        return is_relevant, anchor_relevance, sentence_relevance

    except Exception as e:
        print(f"Error validating relevance: {e}")
        return True, 0.5, 0.5  # Default to allowing if error
500
+
501
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
502
  try:
503
  blocks = get_text_blocks(source_url)
 
509
  print(f"DEBUG: Looking for anchor: '{anchor_text}'")
510
  print("="*50)
511
 
512
+ # ANALYZE TARGET URL FIRST - This is the key addition
513
+ target_context = analyze_target_url(target_url)
514
+
515
+ # Validate that the anchor text is relevant to the target page
516
+ is_relevant, anchor_score, _ = validate_anchor_relevance(
517
+ anchor_text,
518
+ anchor_text, # Check anchor against itself first
519
+ target_context,
520
+ threshold=0.25 # Lower threshold for initial check
521
+ )
522
+
523
+ if not is_relevant and anchor_score < 0.2:
524
+ print(f"\nโš ๏ธ WARNING: Anchor '{anchor_text}' seems unrelated to target page content!")
525
+ print(f"Target page appears to be about: {target_context['title'][:100]}")
526
+
527
  # Check if keyword is present in the article
528
  full_text = " ".join(blocks)
529
  full_text_lower = full_text.lower()
 
555
 
556
  print(f"Keyword present in article: {keyword_present}")
557
 
 
 
 
 
 
 
 
 
 
558
  ext = tldextract.extract(target_url)
559
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
560
 
561
+ # Use the comprehensive target context for finding best match
562
+ query = f"{anchor_text} โ€” relevant to: {target_context['summary'][:200]}"
563
 
564
  try:
565
  q_emb = embed([query])[0]
 
616
 
617
  rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
618
 
619
+ # Validate the sentence relevance to target before including it
620
+ is_relevant, _, sent_relevance = validate_anchor_relevance(
621
+ anchor_text,
622
+ best_sent,
623
+ target_context,
624
+ threshold=0.25
625
+ )
626
+
627
  result = {
628
  "anchor_was_present": anchor_found_in_sentence,
629
  "best_sentence_original": best_sent,
630
  "best_sentence_with_anchor": rewritten_sent,
631
+ "keyword_in_article": keyword_present,
632
+ "relevance_score": sent_relevance,
633
+ "is_relevant": is_relevant,
634
+ "target_title": target_context.get("title", ""),
635
+ "target_topic": target_context.get("meta_description", "")[:100]
636
  }
637
 
638
  # If anchor not present in article and alternative suggestion requested
639
  if suggest_alternative and not keyword_present:
640
  try:
641
  # Find a completely different anchor and sentence
642
+ # Pass the target_context we already analyzed
643
+ alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text, target_context)
644
 
645
  if alt_anchor and alt_sentence:
646
  # Create the sentence with the alternative anchor
 
863
  # Check if anchor was already present in the article
864
  anchor_was_present = res.get("anchor_was_present", False)
865
  keyword_in_article = res.get("keyword_in_article", False)
866
+ relevance_score = res.get("relevance_score", 0)
867
+ is_relevant = res.get("is_relevant", True)
868
+ target_title = res.get("target_title", "")
869
+ target_topic = res.get("target_topic", "")
870
+
871
+ # Add warning if low relevance detected
872
+ relevance_warning = ""
873
+ if not is_relevant or relevance_score < 0.25:
874
+ relevance_warning = f"\n\nโš ๏ธ **Warning**: The suggested content may not be highly relevant to the target page.\n"
875
+ relevance_warning += f"Target page appears to be about: {target_title[:100]}\n"
876
+ relevance_warning += f"Relevance score: {relevance_score:.2f}\n"
877
 
878
  # If anchor is present in the article (even if not in the best sentence)
879
  if keyword_in_article:
 
882
  # Anchor is in the suggested sentence - just show where to add the link
883
  final_output = to_plain_text(draft_html) if plain_text else draft_html
884
  result = warn + f"โœ… **Anchor text '{anchor_text}' found in article!**\n\n"
885
+ result += f"๐Ÿ”— Add link here:\n\n"
886
  result += f"{final_output}"
887
+ result += relevance_warning
888
+ result += relevance_warning
889
  else:
890
  # Anchor is in article but not in this sentence
891
  if smart_rewrite:
 
899
  final_output = to_plain_text(final_html) if plain_text else final_html
900
 
901
  result = warn + f"โœ… **Anchor text '{anchor_text}' found in article!**\n\n"
902
+ result += f"๐Ÿ”— Add link here:\n\n"
903
  result += f"{final_output}"
904
  else:
905
  # Anchor doesn't exist in article at all - need to add it
 
914
  final_output = to_plain_text(final_html) if plain_text else final_html
915
 
916
  result = warn + f"โš ๏ธ **Anchor text '{anchor_text}' not found in article**\n\n"
917
+ result += f"๐Ÿ”— Result 1 - Suggested placement:\n\n"
918
  result += f"Original: {original_sentence}\n\n"
919
  result += f"Suggested: {final_output}"
920
 
 
945
 
946
  # Add alternative as Result 2
947
  result += f"\n\n{'='*50}\n\n"
948
+ result += f"๐Ÿ”— Result 2 - Alternative from article:\n"
949
  result += f"๐Ÿ’ก Alternative anchor: '{alt_anchor}'\n\n"
950
  result += f"Original: {alt_sentence_original}\n\n"
951
  result += f"Suggested: {alt_output}"