dusan-presswhizz committed on
Commit
bb074b3
·
verified ·
1 Parent(s): c9ce95a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -244
app.py CHANGED
@@ -314,7 +314,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
314
  if i < 0 or i+length > len(words):
315
  continue
316
  phrase = ' '.join(words[i:i+length])
317
- phrase_clean = phrase.strip('.,!?;:"\'')
318
 
319
  # Check if phrase is meaningful
320
  if i < len(words) and i+length-1 < len(words):
@@ -330,7 +330,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
330
 
331
  # Also extract single important words (proper nouns, long words)
332
  for word in words:
333
- clean_word = word.strip('.,!?;:"\'')
334
  if clean_word and (len(clean_word) > 6 or
335
  (len(clean_word) > 0 and clean_word[0].isupper() and clean_word.lower() not in stopwords)):
336
  all_phrases.add(clean_word)
@@ -391,115 +391,6 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
391
  except Exception as e:
392
  print(f"Critical error in find_alternative_anchor: {e}")
393
  return None, None
394
-
395
- except Exception as e:
396
- print(f"Critical error in find_alternative_anchor: {e}")
397
- return None, None
398
-
399
- def analyze_target_url(target_url):
400
- """Deeply analyze the target URL to understand what the page is about."""
401
- try:
402
- # Use the same extraction logic as get_text_blocks
403
- blocks = get_text_blocks(target_url, max_paragraphs=10) # Get more content for better understanding
404
-
405
- # Also get metadata separately
406
- try:
407
- resp = requests.get(target_url, timeout=20, headers=UA)
408
- soup = BeautifulSoup(resp.text, "html.parser")
409
-
410
- # Extract title
411
- title = soup.title.get_text().strip() if soup.title else ""
412
-
413
- # Extract meta description
414
- meta_desc = ""
415
- meta_tag = soup.find("meta", attrs={"name": "description"})
416
- if meta_tag:
417
- meta_desc = meta_tag.get("content", "")
418
-
419
- # Extract h1-h3 headings for topic understanding
420
- headings = []
421
- for h in soup.find_all(['h1', 'h2', 'h3'])[:10]:
422
- heading_text = h.get_text().strip()
423
- if heading_text:
424
- headings.append(heading_text)
425
- except Exception as e:
426
- print(f"Error getting metadata: {e}")
427
- title = ""
428
- meta_desc = ""
429
- headings = []
430
-
431
- # Combine blocks into full text
432
- full_text = " ".join(blocks) if blocks else ""
433
- main_content = full_text[:1500] if full_text else ""
434
-
435
- target_context = {
436
- "title": title,
437
- "meta_description": meta_desc,
438
- "headings": headings,
439
- "main_content": main_content,
440
- "full_text": full_text[:3000], # Limit for embedding
441
- "summary": f"{title} {meta_desc} {' '.join(headings[:5])} {main_content[:500]}"
442
- }
443
-
444
- print(f"\nTarget URL Analysis:")
445
- print(f" Title: {title[:100]}")
446
- print(f" Meta: {meta_desc[:100]}")
447
- print(f" Main headings: {headings[:3]}")
448
- print(f" Extracted {len(blocks)} blocks")
449
-
450
- return target_context
451
-
452
- except Exception as e:
453
- print(f"Error analyzing target URL: {e}")
454
- return {
455
- "title": "",
456
- "meta_description": "",
457
- "headings": [],
458
- "main_content": "",
459
- "full_text": "",
460
- "summary": ""
461
- }
462
-
463
- def validate_anchor_relevance(anchor_text, sentence, target_context, threshold=0.3):
464
- """Check if the anchor and sentence are relevant to the target page content."""
465
- try:
466
- # Create embedding for target page context
467
- target_summary = target_context.get("summary", "")
468
- if not target_summary:
469
- return True # If we can't analyze, assume it's ok
470
-
471
- # Embed target content
472
- target_emb = embed([target_summary])[0]
473
-
474
- # Check anchor relevance to target
475
- anchor_emb = embed([anchor_text])[0]
476
- anchor_relevance = F.cosine_similarity(
477
- anchor_emb.unsqueeze(0),
478
- target_emb.unsqueeze(0)
479
- ).item()
480
-
481
- # Check sentence relevance to target
482
- sentence_emb = embed([sentence])[0]
483
- sentence_relevance = F.cosine_similarity(
484
- sentence_emb.unsqueeze(0),
485
- target_emb.unsqueeze(0)
486
- ).item()
487
-
488
- print(f"\nRelevance scores:")
489
- print(f" Anchor '{anchor_text}' to target: {anchor_relevance:.3f}")
490
- print(f" Sentence to target: {sentence_relevance:.3f}")
491
-
492
- # Return true if either anchor or sentence is relevant enough
493
- is_relevant = anchor_relevance > threshold or sentence_relevance > threshold
494
-
495
- if not is_relevant:
496
- print(f" ⚠️ Low relevance detected! Anchor/sentence may not match target page topic.")
497
-
498
- return is_relevant, anchor_relevance, sentence_relevance
499
-
500
- except Exception as e:
501
- print(f"Error validating relevance: {e}")
502
- return True, 0.5, 0.5 # Default to allowing if error
503
 
504
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
505
  try:
@@ -512,21 +403,6 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
512
  print(f"DEBUG: Looking for anchor: '{anchor_text}'")
513
  print("="*50)
514
 
515
- # ANALYZE TARGET URL FIRST - This is the key addition
516
- target_context = analyze_target_url(target_url)
517
-
518
- # Validate that the anchor text is relevant to the target page
519
- is_relevant, anchor_score, _ = validate_anchor_relevance(
520
- anchor_text,
521
- anchor_text, # Check anchor against itself first
522
- target_context,
523
- threshold=0.25 # Lower threshold for initial check
524
- )
525
-
526
- if not is_relevant and anchor_score < 0.2:
527
- print(f"\n⚠️ WARNING: Anchor '{anchor_text}' seems unrelated to target page content!")
528
- print(f"Target page appears to be about: {target_context['title'][:100]}")
529
-
530
  # Check if keyword is present in the article
531
  full_text = " ".join(blocks)
532
  full_text_lower = full_text.lower()
@@ -558,11 +434,20 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
558
 
559
  print(f"Keyword present in article: {keyword_present}")
560
 
 
 
 
 
 
 
 
 
 
561
  ext = tldextract.extract(target_url)
562
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
563
 
564
- # Use the comprehensive target context for finding best match
565
- query = f"{anchor_text} — relevant to: {target_context['summary'][:200]}"
566
 
567
  try:
568
  q_emb = embed([query])[0]
@@ -619,23 +504,11 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
619
 
620
  rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
621
 
622
- # Validate the sentence relevance to target before including it
623
- is_relevant, _, sent_relevance = validate_anchor_relevance(
624
- anchor_text,
625
- best_sent,
626
- target_context,
627
- threshold=0.25
628
- )
629
-
630
  result = {
631
  "anchor_was_present": anchor_found_in_sentence,
632
  "best_sentence_original": best_sent,
633
  "best_sentence_with_anchor": rewritten_sent,
634
- "keyword_in_article": keyword_present,
635
- "relevance_score": sent_relevance,
636
- "is_relevant": is_relevant,
637
- "target_title": target_context.get("title", ""),
638
- "target_topic": target_context.get("meta_description", "")[:100]
639
  }
640
 
641
  # If anchor not present in article and alternative suggestion requested
@@ -653,8 +526,6 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
653
  result["alternative_exact_match"] = alt_exact
654
  except Exception as e:
655
  print(f"Error finding alternative anchor: {e}")
656
- import traceback
657
- traceback.print_exc()
658
  # Continue without alternative
659
 
660
  results.append(result)
@@ -771,89 +642,7 @@ def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", languag
771
  # Don't check for exact anchor text match as it might have special chars
772
  return {"sentence_html": out}
773
 
774
- def gpt_get_target_keywords(target_url, target_context, language="English"):
775
- """Ask GPT to suggest 5-10 relevant search keywords users would use to find this page."""
776
- if not OPENAI_API_KEY:
777
- return []
778
-
779
- # Create cache key
780
- cache_key = hashlib.md5(f"keywords_{target_url}{language}".encode()).hexdigest()
781
-
782
- if cache_key in API_RESPONSE_CACHE:
783
- print(f"[GPT] Using cached keywords for {target_url[:30]}...")
784
- return API_RESPONSE_CACHE[cache_key].get("keywords", [])
785
-
786
- title = target_context.get("title", "")
787
- meta = target_context.get("meta_description", "")
788
- content = target_context.get("main_content", "")[:500]
789
-
790
- system = (
791
- "You are an SEO expert. Based on the page content provided, suggest 5-10 search keywords or phrases "
792
- "that users would likely type into Google to find this page. "
793
- "Include both short keywords (1-2 words) and long-tail keywords (3-5 words). "
794
- "Make them realistic search terms, not just words from the page. "
795
- f"Consider the {language} language and local search patterns. "
796
- "Return JSON with a 'keywords' array."
797
- )
798
-
799
- user = {
800
- "url": target_url,
801
- "title": title,
802
- "meta_description": meta,
803
- "content_preview": content,
804
- "task": "Generate search keywords users would use to find this page"
805
- }
806
-
807
- try:
808
- obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
809
- keywords = obj.get("keywords", [])
810
- print(f"\n[GPT] Target page keywords: {keywords}")
811
- return keywords
812
- except Exception as e:
813
- print(f"[GPT] Error getting keywords: {e}")
814
- return []
815
-
816
- def gpt_add_keyword_to_content(blocks, keywords, target_url, language="English"):
817
- """Ask GPT to naturally add one of the keywords to the content with proper context."""
818
- if not OPENAI_API_KEY or not keywords:
819
- return None
820
-
821
- # Create cache key
822
- blocks_preview = " ".join(blocks[:3])[:500]
823
- cache_key = hashlib.md5(f"add_kw_{blocks_preview}{str(keywords)}{target_url}".encode()).hexdigest()
824
-
825
- if cache_key in API_RESPONSE_CACHE:
826
- return API_RESPONSE_CACHE[cache_key]
827
-
828
- system = (
829
- f"You are a skilled content editor writing in {language}. "
830
- "Your task is to naturally integrate ONE of the provided keywords into the article content. "
831
- "RULES: "
832
- "1. Choose the keyword that fits most naturally with the existing content "
833
- "2. Add 2-3 sentences or a short paragraph that includes the keyword "
834
- "3. Make it flow naturally - it should feel like it belongs there "
835
- "4. Include an HTML link using the keyword as anchor text "
836
- "5. Specify WHERE to add it (e.g., 'after the second paragraph', 'before the conclusion') "
837
- "6. The addition should provide value, not just keyword stuffing "
838
- f"7. Write in {language} and preserve special characters "
839
- "Return JSON with: 'keyword_used', 'content_to_add', 'placement_instruction'"
840
- )
841
-
842
- user = {
843
- "article_preview": " ".join(blocks[:5]),
844
- "available_keywords": keywords,
845
- "target_url": target_url,
846
- "language": language,
847
- "task": "Add one keyword naturally to the content"
848
- }
849
-
850
- try:
851
- obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
852
- API_RESPONSE_CACHE[cache_key] = obj
853
- return obj
854
- except Exception as e:
855
- print(f"[GPT] Error adding keyword: {e}")
856
- return None
857
  """
858
  Final QA pass with language support.
859
  """
@@ -949,17 +738,6 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
949
  # Check if anchor was already present in the article
950
  anchor_was_present = res.get("anchor_was_present", False)
951
  keyword_in_article = res.get("keyword_in_article", False)
952
- relevance_score = res.get("relevance_score", 0)
953
- is_relevant = res.get("is_relevant", True)
954
- target_title = res.get("target_title", "")
955
- target_topic = res.get("target_topic", "")
956
-
957
- # Add warning if low relevance detected
958
- relevance_warning = ""
959
- if not is_relevant or relevance_score < 0.25:
960
- relevance_warning = f"\n\n⚠️ **Warning**: The suggested content may not be highly relevant to the target page.\n"
961
- relevance_warning += f"Target page appears to be about: {target_title[:100]}\n"
962
- relevance_warning += f"Relevance score: {relevance_score:.2f}\n"
963
 
964
  # If anchor is present in the article (even if not in the best sentence)
965
  if keyword_in_article:
@@ -968,10 +746,8 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
968
  # Anchor is in the suggested sentence - just show where to add the link
969
  final_output = to_plain_text(draft_html) if plain_text else draft_html
970
  result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
971
- result += f"🔗 Add link here:\n\n"
972
  result += f"{final_output}"
973
- result += relevance_warning
974
- result += relevance_warning
975
  else:
976
  # Anchor is in article but not in this sentence
977
  if smart_rewrite:
@@ -985,7 +761,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
985
  final_output = to_plain_text(final_html) if plain_text else final_html
986
 
987
  result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
988
- result += f"🔗 Add link here:\n\n"
989
  result += f"{final_output}"
990
  else:
991
  # Anchor doesn't exist in article at all - need to add it
@@ -1000,10 +776,9 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
1000
  final_output = to_plain_text(final_html) if plain_text else final_html
1001
 
1002
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
1003
- result += f"🔗 Result 1 - Suggested placement:\n\n"
1004
  result += f"Original: {original_sentence}\n\n"
1005
  result += f"Suggested: {final_output}"
1006
- result += relevance_warning
1007
 
1008
  # Show alternative if requested and available
1009
  if suggest_alternative_anchor and res.get("alternative_anchor"):
@@ -1032,7 +807,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
1032
 
1033
  # Add alternative as Result 2
1034
  result += f"\n\n{'='*50}\n\n"
1035
- result += f"🔗 Result 2 - Alternative from article:\n"
1036
  result += f"πŸ’‘ Alternative anchor: '{alt_anchor}'\n\n"
1037
  result += f"Original: {alt_sentence_original}\n\n"
1038
  result += f"Suggested: {alt_output}"
 
314
  if i < 0 or i+length > len(words):
315
  continue
316
  phrase = ' '.join(words[i:i+length])
317
+ phrase_clean = phrase.strip('.,!?;:"\' ')
318
 
319
  # Check if phrase is meaningful
320
  if i < len(words) and i+length-1 < len(words):
 
330
 
331
  # Also extract single important words (proper nouns, long words)
332
  for word in words:
333
+ clean_word = word.strip('.,!?;:"\' ')
334
  if clean_word and (len(clean_word) > 6 or
335
  (len(clean_word) > 0 and clean_word[0].isupper() and clean_word.lower() not in stopwords)):
336
  all_phrases.add(clean_word)
 
391
  except Exception as e:
392
  print(f"Critical error in find_alternative_anchor: {e}")
393
  return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
396
  try:
 
403
  print(f"DEBUG: Looking for anchor: '{anchor_text}'")
404
  print("="*50)
405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  # Check if keyword is present in the article
407
  full_text = " ".join(blocks)
408
  full_text_lower = full_text.lower()
 
434
 
435
  print(f"Keyword present in article: {keyword_present}")
436
 
437
+ # Target context for similarity matching
438
+ try:
439
+ tgt_html = requests.get(target_url, timeout=20, headers=UA).text
440
+ tt = BeautifulSoup(tgt_html, "html.parser").title
441
+ tgt_title = tt.get_text().strip() if tt else ""
442
+ except Exception as e:
443
+ print(f"Error fetching target URL: {e}")
444
+ tgt_title = ""
445
+
446
  ext = tldextract.extract(target_url)
447
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
448
 
449
+ # Find best match with original anchor
450
+ query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
451
 
452
  try:
453
  q_emb = embed([query])[0]
 
504
 
505
  rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
506
 
 
 
 
 
 
 
 
 
507
  result = {
508
  "anchor_was_present": anchor_found_in_sentence,
509
  "best_sentence_original": best_sent,
510
  "best_sentence_with_anchor": rewritten_sent,
511
+ "keyword_in_article": keyword_present
 
 
 
 
512
  }
513
 
514
  # If anchor not present in article and alternative suggestion requested
 
526
  result["alternative_exact_match"] = alt_exact
527
  except Exception as e:
528
  print(f"Error finding alternative anchor: {e}")
 
 
529
  # Continue without alternative
530
 
531
  results.append(result)
 
642
  # Don't check for exact anchor text match as it might have special chars
643
  return {"sentence_html": out}
644
 
645
+ def gpt_validate_and_polish(sentence_html, anchor_text, target_url, language="English"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
  """
647
  Final QA pass with language support.
648
  """
 
738
  # Check if anchor was already present in the article
739
  anchor_was_present = res.get("anchor_was_present", False)
740
  keyword_in_article = res.get("keyword_in_article", False)
 
 
 
 
 
 
 
 
 
 
 
741
 
742
  # If anchor is present in the article (even if not in the best sentence)
743
  if keyword_in_article:
 
746
  # Anchor is in the suggested sentence - just show where to add the link
747
  final_output = to_plain_text(draft_html) if plain_text else draft_html
748
  result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
749
+ result += f"πŸ“ Add link here:\n\n"
750
  result += f"{final_output}"
 
 
751
  else:
752
  # Anchor is in article but not in this sentence
753
  if smart_rewrite:
 
761
  final_output = to_plain_text(final_html) if plain_text else final_html
762
 
763
  result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
764
+ result += f"πŸ“ Add link here:\n\n"
765
  result += f"{final_output}"
766
  else:
767
  # Anchor doesn't exist in article at all - need to add it
 
776
  final_output = to_plain_text(final_html) if plain_text else final_html
777
 
778
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
779
+ result += f"πŸ“ Result 1 - Suggested placement:\n\n"
780
  result += f"Original: {original_sentence}\n\n"
781
  result += f"Suggested: {final_output}"
 
782
 
783
  # Show alternative if requested and available
784
  if suggest_alternative_anchor and res.get("alternative_anchor"):
 
807
 
808
  # Add alternative as Result 2
809
  result += f"\n\n{'='*50}\n\n"
810
+ result += f"πŸ“ Result 2 - Alternative from article:\n"
811
  result += f"πŸ’‘ Alternative anchor: '{alt_anchor}'\n\n"
812
  result += f"Original: {alt_sentence_original}\n\n"
813
  result += f"Suggested: {alt_output}"