dusan-presswhizz committed on
Commit
037345b
·
verified ·
1 Parent(s): e3a599d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -148
app.py CHANGED
@@ -1,114 +1,4 @@
1
- def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor):
2
- if not source_url or not target_url or not anchor_text:
3
- return "❌ Please provide Source URL, Target URL, and Anchor Text."
4
-
5
- # Auto-correct swapped inputs
6
- warn = ""
7
- if looks_like_url(anchor_text) and not looks_like_url(target_url):
8
- anchor_text, target_url = target_url, anchor_text
9
- warn = "ℹ️ Detected swapped inputs. I used the URL as Target URL and the text as Anchor.\n\n"
10
-
11
- source_url = normalize_url(source_url)
12
- target_url = normalize_url(target_url)
13
-
14
- try:
15
- # Get source blocks for later use
16
- source_blocks = get_text_blocks(source_url)
17
-
18
- # Only get 1 result, not multiple!
19
- results = suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=suggest_alternative_anchor)
20
- res = results[0]
21
- except Exception as e:
22
- return f"❌ Error processing the page: {str(e)}"
23
-
24
- if "error" in res:
25
- return f"❌ {res['error']}"
26
-
27
- # Detect language from the original sentence
28
- original_sentence = res['best_sentence_original']
29
- detected_lang = detect_language(original_sentence)
30
- language_name = get_language_name(detected_lang)
31
- print(f"Detected language: {language_name} ({detected_lang})")
32
-
33
- # Process original anchor suggestion
34
- draft_html = res["best_sentence_with_anchor"]
35
-
36
- # Check if anchor was already present in the article
37
- anchor_was_present = res.get("anchor_was_present", False)
38
- keyword_in_article = res.get("keyword_in_article", False)
39
-
40
- # If anchor is present in the article (even if not in the best sentence)
41
- if keyword_in_article:
42
- # Anchor exists somewhere in article
43
- if anchor_was_present:
44
- # Anchor is in the suggested sentence - just show where to add the link
45
- final_output = to_plain_text(draft_html) if plain_text else draft_html
46
- result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
47
- result += f"🔗 Add link here:\n\n"
48
- result += f"{final_output}"
49
- else:
50
- # Anchor is in article but not in this sentence
51
- if smart_rewrite:
52
- g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
53
- final_html = g["sentence_html"]
54
- else:
55
- final_html = draft_html
56
-
57
- polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
58
- final_html = polished.get("sentence_html", final_html)
59
- final_output = to_plain_text(final_html) if plain_text else final_html
60
-
61
- result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
62
- result += f"🔗 Add link here:\n\n"
63
- result += f"{final_output}"
64
- else:
65
- # Anchor doesn't exist in article at all - need to add it
66
- if smart_rewrite:
67
- g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
68
- final_html = g["sentence_html"]
69
- else:
70
- final_html = draft_html
71
-
72
- polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
73
- final_html = polished.get("sentence_html", final_html)
74
- final_output = to_plain_text(final_html) if plain_text else final_html
75
-
76
- result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
77
- result += f"🔗 Result 1 - Suggested placement:\n\n"
78
- result += f"Change this sentence: {original_sentence}\n\n"
79
- result += f"With this one: {final_output}"
80
-
81
- # Show alternative if requested and available
82
- if suggest_alternative_anchor and res.get("alternative_anchor"):
83
- alt_anchor = res["alternative_anchor"]
84
- alt_content = res.get("alternative_sentence", "") # This now contains position info + content
85
-
86
- if alt_content:
87
- # The content already has the link included from GPT
88
- # Parse the position text and actual content
89
- if "[Insert after paragraph" in alt_content:
90
- parts = alt_content.split("\n\n", 1)
91
- position_and_sentence = parts[0] if len(parts) > 0 else ""
92
- actual_content = parts[1] if len(parts) > 1 else alt_content
93
- else:
94
- position_and_sentence = ""
95
- actual_content = alt_content
96
-
97
- alt_output = to_plain_text(actual_content) if plain_text else actual_content
98
-
99
- # Add alternative as Result 2
100
- result += f"\n\n{'='*50}\n\n"
101
- result += f"🔗 Result 2 - Suggested new anchor with placement:\n"
102
- result += f"💡 Using keyword: '{alt_anchor}'\n"
103
-
104
- if position_and_sentence:
105
- result += f"\n{position_and_sentence}\n\n"
106
-
107
- result += f"Add this: {alt_output}"
108
-
109
- return result:
110
- # No paragraph info available
111
- result += f"\n{alt_output}"import os, re, json, requests, urllib.parse
112
  import torch, torch.nn.functional as F
113
  from bs4 import BeautifulSoup
114
  from transformers import AutoTokenizer, AutoModel
@@ -417,11 +307,11 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
417
  chosen_keyword = result.get("chosen_keyword", keywords[0] if keywords else original_anchor)
418
  new_content = result.get("new_content", "")
419
  insert_after = result.get("insert_after_paragraph", 0)
420
- after_sentence = result.get("after_sentence", "")
421
 
422
  # Format the response for compatibility
423
  # Return: (anchor_text, formatted_content_with_position)
424
- position_text = f"[Insert after paragraph {insert_after + 1}]\nAfter this sentence: {after_sentence}"
425
 
426
  return chosen_keyword, f"{position_text}\n\n{new_content}"
427
 
@@ -645,7 +535,7 @@ def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", languag
645
  "(2) Do NOT use an em dash or any dash. "
646
  '(3) Avoid phrases like "for details", "click here", "learn more", "visit", "read more". '
647
  "Prefer integrating the anchor as part of the sentence. "
648
- f"(4) Write in {language} and preserve ALL special characters (ć, đ, š, ž, č, etc.). "
649
  "Return a compact JSON object with key sentence_html only."
650
  )
651
 
@@ -770,7 +660,21 @@ def gpt_get_search_keywords(target_content, target_url):
770
 
771
  return obj.get("keywords", ["related content"])
772
 
773
- ates the most suitable keyword. "
 
 
 
 
 
 
 
 
 
 
 
 
 
 
774
  "The addition should flow seamlessly with the existing content. "
775
  "\n\nYOUR TASK:\n"
776
  "1. Choose the ONE keyword that fits most naturally with the article's context\n"
@@ -891,8 +795,8 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
891
 
892
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
893
  result += f"🔗 Result 1 - Suggested placement:\n\n"
894
- result += f"Change this sentence: {original_sentence}\n\n"
895
- result += f"With this one: {final_output}"
896
 
897
  # Show alternative if requested and available
898
  if suggest_alternative_anchor and res.get("alternative_anchor"):
@@ -901,47 +805,24 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
901
 
902
  if alt_content:
903
  # Parse if there's position information
904
- paragraph_num = None
905
- actual_content = alt_content
906
-
907
  if "[Insert after paragraph" in alt_content:
908
  parts = alt_content.split("\n\n", 1)
909
  position_info = parts[0] if len(parts) > 0 else ""
910
  actual_content = parts[1] if len(parts) > 1 else alt_content
911
-
912
- # Extract paragraph number from position_info
913
- import re
914
- match = re.search(r'\[Insert after paragraph (\d+)\]', position_info)
915
- if match:
916
- paragraph_num = int(match.group(1)) - 1 # Convert to 0-based index
917
 
918
  # The content already has the link included from GPT
919
  alt_output = to_plain_text(actual_content) if plain_text else actual_content
920
 
921
  # Add alternative as Result 2
922
  result += f"\n\n{'='*50}\n\n"
923
- result += f"🔗 Result 2 - Suggested new anchor with placement:\n"
924
  result += f"💡 Using keyword: '{alt_anchor}'\n"
925
-
926
- # Try to get the actual paragraph sentence to show where to insert
927
- if paragraph_num is not None and paragraph_num < len(source_blocks):
928
- try:
929
- # Get the last sentence of the target paragraph
930
- target_paragraph = source_blocks[paragraph_num]
931
- # Split into sentences and get the last one
932
- sentences = re.split(r'(?<=[.!?])\s+', target_paragraph.strip())
933
- last_sentence = sentences[-1] if sentences else target_paragraph[:100] + "..."
934
-
935
- result += f"\n[Insert after paragraph {paragraph_num + 1}]\n"
936
- result += f"After this sentence: {last_sentence}\n\n"
937
- result += f"Add this: {alt_output}"
938
- except:
939
- # Fallback if we can't get the paragraph
940
- result += f"\n[Insert after paragraph {paragraph_num + 1}]\n\n"
941
- result += f"{alt_output}"
942
- else:
943
- # No paragraph info available
944
- result += f"\n{alt_output}"
945
 
946
  return result
947
 
@@ -998,7 +879,7 @@ with gr.Blocks(title=f"Link Insertion Helper • GPT: {gpt_status}") as demo:
998
 
999
  gr.Markdown("""
1000
  ### Features:
1001
- 🌍 **Auto Language Detection**: Preserves special characters (ć, đ, š, ž, č, etc.)
1002
  - 💾 **Smart Caching**: Caches embeddings and API responses for faster repeated queries
1003
  - 🎯 **Main Content Focus**: Prioritizes first 5-7 paragraphs, ignores author bios
1004
  - 🔄 **Alternative Anchor Suggestion**: When your anchor isn't in the article, suggests better anchors from existing text
 
1
+ import os, re, json, requests, urllib.parse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import torch, torch.nn.functional as F
3
  from bs4 import BeautifulSoup
4
  from transformers import AutoTokenizer, AutoModel
 
307
  chosen_keyword = result.get("chosen_keyword", keywords[0] if keywords else original_anchor)
308
  new_content = result.get("new_content", "")
309
  insert_after = result.get("insert_after_paragraph", 0)
310
+ reasoning = result.get("reasoning", "")
311
 
312
  # Format the response for compatibility
313
  # Return: (anchor_text, formatted_content_with_position)
314
+ position_text = f"[Insert after paragraph {insert_after + 1}]: {reasoning}"
315
 
316
  return chosen_keyword, f"{position_text}\n\n{new_content}"
317
 
 
535
  "(2) Do NOT use an em dash or any dash. "
536
  '(3) Avoid phrases like "for details", "click here", "learn more", "visit", "read more". '
537
  "Prefer integrating the anchor as part of the sentence. "
538
+ f"(4) Write in {language} and preserve ALL special characters (ć, č, š, ž, đ, etc.). "
539
  "Return a compact JSON object with key sentence_html only."
540
  )
541
 
 
660
 
661
  return obj.get("keywords", ["related content"])
662
 
663
+ def gpt_generate_content_with_keyword(source_blocks, keywords, target_url, language="English"):
664
+ """
665
+ Generate new content with the best keyword and specify where to insert it.
666
+ """
667
+ if not OPENAI_API_KEY or not keywords:
668
+ return None
669
+
670
+ # Create cache key
671
+ source_preview = " ".join(source_blocks[:3])[:500]
672
+ cache_key = hashlib.md5(f"generate_{source_preview}_{str(keywords)}_{target_url}_{language}".encode()).hexdigest()
673
+
674
+ system = (
675
+ f"You are a skilled content writer writing in {language}. "
676
+ "Given an article and a list of keywords related to a target page, "
677
+ "create a NATURAL addition to the article that incorporates the most suitable keyword. "
678
  "The addition should flow seamlessly with the existing content. "
679
  "\n\nYOUR TASK:\n"
680
  "1. Choose the ONE keyword that fits most naturally with the article's context\n"
 
795
 
796
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
797
  result += f"🔗 Result 1 - Suggested placement:\n\n"
798
+ result += f"Original: {original_sentence}\n\n"
799
+ result += f"Suggested: {final_output}"
800
 
801
  # Show alternative if requested and available
802
  if suggest_alternative_anchor and res.get("alternative_anchor"):
 
805
 
806
  if alt_content:
807
  # Parse if there's position information
 
 
 
808
  if "[Insert after paragraph" in alt_content:
809
  parts = alt_content.split("\n\n", 1)
810
  position_info = parts[0] if len(parts) > 0 else ""
811
  actual_content = parts[1] if len(parts) > 1 else alt_content
812
+ else:
813
+ position_info = ""
814
+ actual_content = alt_content
 
 
 
815
 
816
  # The content already has the link included from GPT
817
  alt_output = to_plain_text(actual_content) if plain_text else actual_content
818
 
819
  # Add alternative as Result 2
820
  result += f"\n\n{'='*50}\n\n"
821
+ result += f"🔗 Result 2 - Suggested new content to add:\n"
822
  result += f"💡 Using keyword: '{alt_anchor}'\n"
823
+ if position_info:
824
+ result += f"📍 {position_info}\n"
825
+ result += f"\n{alt_output}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
826
 
827
  return result
828
 
 
879
 
880
  gr.Markdown("""
881
  ### Features:
882
+ - 🌍 **Auto Language Detection**: Preserves special characters (ć, č, š, ž, đ, etc.)
883
  - 💾 **Smart Caching**: Caches embeddings and API responses for faster repeated queries
884
  - 🎯 **Main Content Focus**: Prioritizes first 5-7 paragraphs, ignores author bios
885
  - 🔄 **Alternative Anchor Suggestion**: When your anchor isn't in the article, suggests better anchors from existing text