dusan-presswhizz committed on
Commit
d6c6bec
·
verified ·
1 Parent(s): 40600a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -106
app.py CHANGED
@@ -137,9 +137,8 @@ def is_likely_author_bio_or_footer(element, text):
137
  return False
138
 
139
  def get_text_blocks(url, max_paragraphs=7):
140
- """Extract text blocks, prioritizing main content paragraphs."""
141
  try:
142
- # Try with a more complete User-Agent that mimics a real browser
143
  headers = {
144
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
145
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
@@ -152,13 +151,12 @@ def get_text_blocks(url, max_paragraphs=7):
152
 
153
  resp = requests.get(url, timeout=20, headers=headers, allow_redirects=True)
154
  resp.raise_for_status()
155
- # Ensure proper encoding
156
  resp.encoding = resp.apparent_encoding
157
  soup = BeautifulSoup(resp.text, "html.parser")
158
  except Exception as e:
159
  print(f"Error fetching URL {url}: {e}")
160
- # Try one more time with just the basic User-Agent
161
  try:
 
162
  resp = requests.get(url, timeout=20, headers=UA)
163
  resp.raise_for_status()
164
  resp.encoding = resp.apparent_encoding
@@ -171,63 +169,88 @@ def get_text_blocks(url, max_paragraphs=7):
171
  tag.decompose()
172
 
173
  blocks = []
174
- paragraph_count = 0
175
-
176
- # Try to find main content area - add more potential content container names
177
- main_content = (
178
- soup.find('main') or
179
- soup.find('article') or
180
- soup.find('div', class_=re.compile('content|main|article|post|entry|blog', re.I)) or
181
- soup.find('div', id=re.compile('content|main|article|post|entry', re.I)) or
182
- soup.find('div', class_='container') or
183
- soup.find('div', role='main')
184
- )
 
 
 
 
 
 
 
 
 
185
 
186
  if not main_content:
187
  main_content = soup.body if soup.body else soup
188
 
189
- # If still no paragraphs found, try a more aggressive approach
190
- elements_to_check = main_content.find_all(["p","li","h2","h3","h4","blockquote","div"])
191
-
192
- for el in elements_to_check:
193
- # Skip if likely author bio or footer content
194
- if is_likely_author_bio_or_footer(el, el.get_text()):
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  continue
196
-
197
- txt = " ".join(el.get_text(" ", strip=True).split())
198
 
199
- # For divs, only include if they have substantial text and no nested block elements
200
- if el.name == 'div':
201
- # Skip divs that contain other block elements (they're containers)
202
- if el.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section']):
203
- continue
204
- # Only include divs with substantial text content
205
- if len(txt) < 100:
206
  continue
207
 
208
- if len(txt) > 60:
209
- # Avoid duplicate content
210
- if txt not in blocks:
211
- blocks.append(txt)
212
- if el.name in ['p', 'div']: # Count paragraphs and text divs
213
- paragraph_count += 1
214
- if paragraph_count >= max_paragraphs:
215
- break
216
-
217
- # If we still have no blocks, try to get ANY text from the page
218
- if not blocks:
219
- print(f"Warning: No standard blocks found, attempting fallback extraction for {url}")
220
- # Get all text from body
221
- if soup.body:
222
- all_text = soup.body.get_text(separator="\n")
223
- # Split by newlines and filter
224
- lines = all_text.split('\n')
225
- for line in lines:
226
- line = " ".join(line.split()).strip()
227
- if len(line) > 100: # Only substantial lines
228
- blocks.append(line)
229
- if len(blocks) >= max_paragraphs:
230
- break
 
 
 
 
 
 
 
 
 
231
 
232
  return blocks
233
 
@@ -287,15 +310,6 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
287
  rewritten = f'{base}{clause}{punct}'
288
  return rewritten, False
289
 
290
- def create_anchor_suggestion(anchor_text, target_url):
291
- """Create a suggestion for where to add the anchor when it's not found in the article."""
292
- suggestions = [
293
- f'Consider adding a new sentence like: "For more information on this topic, see <a href="{target_url}">{anchor_text}</a>."',
294
- f'You could add: "Additional insights can be found in <a href="{target_url}">{anchor_text}</a>."',
295
- f'Suggestion: "This relates to concepts discussed in <a href="{target_url}">{anchor_text}</a>."'
296
- ]
297
- return suggestions[0]
298
-
299
  def find_alternative_anchor(blocks, target_url, original_anchor):
300
  """Find a better anchor text from the article that relates to the target URL."""
301
  try:
@@ -433,21 +447,43 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
433
  if not blocks:
434
  return [{"error":"No text blocks found on the page."}]
435
 
436
- # Check if keyword is present in the article - need to check case-insensitively
 
 
 
 
 
437
  full_text = " ".join(blocks)
438
  full_text_lower = full_text.lower()
439
  anchor_text_lower = anchor_text.lower() if anchor_text else ""
440
- keyword_present = anchor_text_lower in full_text_lower
441
-
442
- # Also check with normalized text (removing special chars)
 
 
 
 
 
 
 
443
  if not keyword_present:
444
- # Try normalized search
445
- import re
446
- normalized_text = re.sub(r'[^a-z0-9\s]', '', full_text_lower)
447
- normalized_anchor = re.sub(r'[^a-z0-9\s]', '', anchor_text_lower)
448
- keyword_present = normalized_anchor in normalized_text
 
 
 
 
 
 
 
 
 
 
449
 
450
- # target context
451
  try:
452
  tgt_html = requests.get(target_url, timeout=20, headers=UA).text
453
  tt = BeautifulSoup(tgt_html, "html.parser").title
@@ -459,8 +495,8 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
459
  ext = tldextract.extract(target_url)
460
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
461
 
462
- # First, find best match with original anchor
463
- query = f"{anchor_text} – relevant to: {tgt_title} ({tgt_domain})"
464
 
465
  try:
466
  q_emb = embed([query])[0]
@@ -475,7 +511,8 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
475
  results = []
476
  for idx in top_idx:
477
  try:
478
- blk = blocks[min(idx, len(blocks)-1)] # Ensure valid index
 
479
 
480
  # Split sentences more carefully
481
  sents = re.split(r'(?<=[.!?])\s+', blk)
@@ -493,7 +530,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
493
  if len(sents) > 0 and all(len(s) > 0 for s in sents):
494
  s_embs = embed(sents)
495
  s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
496
- si = int(torch.argmax(s_sims).item()) # Use .item() to get scalar
497
  if 0 <= si < len(sents):
498
  best_sent = sents[si]
499
  except Exception as e:
@@ -504,23 +541,23 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
504
  if not best_sent or len(best_sent.strip()) == 0:
505
  best_sent = blk if blk else "Unable to extract sentence from this section."
506
 
507
- # Check if anchor is in THIS specific sentence (case-insensitive)
508
  sentence_lower = best_sent.lower()
509
  anchor_found_in_sentence = anchor_text_lower in sentence_lower
510
 
511
  # If not found with exact match, try normalized
512
  if not anchor_found_in_sentence:
513
- normalized_sent = re.sub(r'[^a-z0-9\s]', '', sentence_lower)
514
- normalized_anchor = re.sub(r'[^a-z0-9\s]', '', anchor_text_lower)
515
  anchor_found_in_sentence = normalized_anchor in normalized_sent
516
 
517
  rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
518
 
519
  result = {
520
- "anchor_was_present": anchor_found_in_sentence, # Use the sentence-specific check
521
  "best_sentence_original": best_sent,
522
  "best_sentence_with_anchor": rewritten_sent,
523
- "keyword_in_article": keyword_present # This is for the whole article
524
  }
525
 
526
  # If anchor not present in article and alternative suggestion requested
@@ -544,6 +581,8 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
544
 
545
  except Exception as e:
546
  print(f"Error processing block {idx}: {e}")
 
 
547
  # Add a fallback result
548
  results.append({
549
  "anchor_was_present": False,
@@ -618,7 +657,7 @@ def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", languag
618
  "(2) Do NOT use an em dash or any dash. "
619
  '(3) Avoid phrases like "for details", "click here", "learn more", "visit", "read more". '
620
  "Prefer integrating the anchor as part of the sentence. "
621
- f"(4) Write in {language} and preserve ALL special characters (č, Δ‡, Ε‘, ΕΎ, Δ‘, etc.). "
622
  "Return a compact JSON object with key sentence_html only."
623
  )
624
 
@@ -711,7 +750,7 @@ def to_plain_text(html_or_text):
711
  return html.unescape(text)
712
 
713
  # =========================
714
- # Gradio UI - FIXED VERSION
715
  # =========================
716
  def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor):
717
  if not source_url or not target_url or not anchor_text:
@@ -747,43 +786,55 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
747
 
748
  # Check if anchor was already present in the article
749
  anchor_was_present = res.get("anchor_was_present", False)
750
-
751
- # Only apply GPT rewriting if anchor wasn't already present
752
- # If anchor is present, we just want to show where to add the link
753
- if anchor_was_present:
754
- # Anchor exists - just show where to add the link
755
- final_output = to_plain_text(draft_html) if plain_text else draft_html
756
- result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
757
- result += f"πŸ“ Add link here:\n\n"
758
- result += f"Original: {original_sentence}\n\n"
759
- result += f"With link: {final_output}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
760
  else:
761
- # Anchor doesn't exist - need to add it to the sentence
762
- # 1) Optional first-pass rewrite with language support
763
  if smart_rewrite:
764
  g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
765
  final_html = g["sentence_html"]
766
  else:
767
  final_html = draft_html
768
 
769
- # 2) QA/polish pass with language support
770
  polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
771
  final_html = polished.get("sentence_html", final_html)
772
-
773
- # 3) Optionally convert to plain text
774
  final_output = to_plain_text(final_html) if plain_text else final_html
775
 
776
- # Build the result for when anchor is NOT present
777
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
778
  result += f"πŸ“ Result 1 - Suggested placement:\n\n"
779
  result += f"Original: {original_sentence}\n\n"
780
  result += f"Suggested: {final_output}"
781
 
782
- # ONLY show alternative if:
783
- # 1. suggest_alternative_anchor is True
784
- # 2. The original anchor was NOT found in the article
785
- # 3. We have an alternative suggestion
786
- if suggest_alternative_anchor and not res.get("keyword_in_article", True) and res.get("alternative_anchor"):
787
  alt_anchor = res["alternative_anchor"]
788
  alt_sentence_original = res.get("alternative_sentence_original", "")
789
  alt_sentence = res.get("alternative_sentence", "")
@@ -844,7 +895,7 @@ with gr.Blocks(title=f"Link Insertion Helper β€’ GPT: {gpt_status}") as demo:
844
  plain_text = gr.Checkbox(label="Plain text (no URL)", value=True)
845
  suggest_alternative_anchor = gr.Checkbox(
846
  label="Suggest alternative anchor",
847
- value=True, # ← CHANGED TO TRUE (DEFAULT CHECKED)
848
  info="If anchor not found, suggest a better anchor from the article"
849
  )
850
 
@@ -869,7 +920,7 @@ with gr.Blocks(title=f"Link Insertion Helper β€’ GPT: {gpt_status}") as demo:
869
 
870
  gr.Markdown("""
871
  ### Features:
872
- - 🌍 **Auto Language Detection**: Preserves special characters (č, Δ‡, Ε‘, ΕΎ, Δ‘, etc.)
873
  - πŸ’Ύ **Smart Caching**: Caches embeddings and API responses for faster repeated queries
874
  - 🎯 **Main Content Focus**: Prioritizes first 5-7 paragraphs, ignores author bios
875
  - πŸ”„ **Alternative Anchor Suggestion**: When your anchor isn't in the article, suggests better anchors from existing text
 
137
  return False
138
 
139
  def get_text_blocks(url, max_paragraphs=7):
140
+ """Extract text blocks with improved extraction that captures all content."""
141
  try:
 
142
  headers = {
143
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
144
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
 
151
 
152
  resp = requests.get(url, timeout=20, headers=headers, allow_redirects=True)
153
  resp.raise_for_status()
 
154
  resp.encoding = resp.apparent_encoding
155
  soup = BeautifulSoup(resp.text, "html.parser")
156
  except Exception as e:
157
  print(f"Error fetching URL {url}: {e}")
 
158
  try:
159
+ # Fallback to simpler headers
160
  resp = requests.get(url, timeout=20, headers=UA)
161
  resp.raise_for_status()
162
  resp.encoding = resp.apparent_encoding
 
169
  tag.decompose()
170
 
171
  blocks = []
172
+ seen_texts = set()
173
+
174
+ # Try to find main content area - be more inclusive
175
+ main_content = None
176
+ content_selectors = [
177
+ ('main', {}),
178
+ ('article', {}),
179
+ ('div', {'class': re.compile('content|main|article|post|entry|blog|body|wrapper', re.I)}),
180
+ ('div', {'id': re.compile('content|main|article|post|entry|body|wrapper', re.I)}),
181
+ ('div', {'role': 'main'}),
182
+ ('div', {'class': 'container'}),
183
+ ]
184
+
185
+ for tag, attrs in content_selectors:
186
+ if attrs:
187
+ main_content = soup.find(tag, attrs)
188
+ else:
189
+ main_content = soup.find(tag)
190
+ if main_content:
191
+ break
192
 
193
  if not main_content:
194
  main_content = soup.body if soup.body else soup
195
 
196
+ # Method 1: Get ALL text from the main content area first
197
+ # This ensures we don't miss any content
198
+ full_text = main_content.get_text(separator="\n")
199
+
200
+ # Split by newlines and process
201
+ lines = full_text.split('\n')
202
+ temp_blocks = []
203
+
204
+ for line in lines:
205
+ clean_line = " ".join(line.strip().split())
206
+ if len(clean_line) > 60: # Only keep substantial lines
207
+ if clean_line not in seen_texts:
208
+ temp_blocks.append(clean_line)
209
+ seen_texts.add(clean_line)
210
+
211
+ # Method 2: Also get specific HTML elements for better structure
212
+ for element in main_content.find_all(['p', 'div', 'li', 'h1', 'h2', 'h3', 'h4', 'blockquote'], recursive=True):
213
+ # Skip if likely author bio or footer
214
+ if is_likely_author_bio_or_footer(element, element.get_text()):
215
  continue
 
 
216
 
217
+ # For divs, skip if they contain other block elements (they're containers)
218
+ if element.name == 'div':
219
+ if element.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'article', 'section']):
 
 
 
 
220
  continue
221
 
222
+ txt = " ".join(element.get_text(" ", strip=True).split())
223
+
224
+ # Add to blocks if substantial and not duplicate
225
+ if len(txt) > 60 and txt not in seen_texts:
226
+ blocks.append(txt)
227
+ seen_texts.add(txt)
228
+
229
+ # If we got blocks from method 2, use those (better structure)
230
+ # Otherwise, use the temp_blocks from method 1
231
+ if not blocks and temp_blocks:
232
+ blocks = temp_blocks[:max_paragraphs]
233
+ elif len(blocks) < max_paragraphs and temp_blocks:
234
+ # Combine both methods - add any unique blocks from temp_blocks
235
+ for tb in temp_blocks:
236
+ if tb not in seen_texts:
237
+ blocks.append(tb)
238
+ seen_texts.add(tb)
239
+ if len(blocks) >= max_paragraphs:
240
+ break
241
+
242
+ # Limit to max_paragraphs
243
+ blocks = blocks[:max_paragraphs]
244
+
245
+ # Debug output
246
+ print(f"\nExtracted {len(blocks)} blocks from {url}")
247
+ if blocks:
248
+ print(f"First block preview: {blocks[0][:200]}...")
249
+ # Check if we have reasonable content
250
+ full_extracted = " ".join(blocks)
251
+ print(f"Total extracted text length: {len(full_extracted)} chars")
252
+ else:
253
+ print("WARNING: No blocks extracted!")
254
 
255
  return blocks
256
 
 
310
  rewritten = f'{base}{clause}{punct}'
311
  return rewritten, False
312
 
 
 
 
 
 
 
 
 
 
313
  def find_alternative_anchor(blocks, target_url, original_anchor):
314
  """Find a better anchor text from the article that relates to the target URL."""
315
  try:
 
447
  if not blocks:
448
  return [{"error":"No text blocks found on the page."}]
449
 
450
+ # DEBUG: Print what we extracted
451
+ print("\n" + "="*50)
452
+ print(f"DEBUG: Looking for anchor: '{anchor_text}'")
453
+ print("="*50)
454
+
455
+ # Check if keyword is present in the article
456
  full_text = " ".join(blocks)
457
  full_text_lower = full_text.lower()
458
  anchor_text_lower = anchor_text.lower() if anchor_text else ""
459
+
460
+ # Multiple ways to check for the anchor
461
+ keyword_present = False
462
+
463
+ # Method 1: Direct case-insensitive search
464
+ if anchor_text_lower in full_text_lower:
465
+ keyword_present = True
466
+ print(f"Found anchor via direct search")
467
+
468
+ # Method 2: Normalized search (remove extra spaces)
469
  if not keyword_present:
470
+ normalized_full = re.sub(r'\s+', ' ', full_text_lower)
471
+ normalized_anchor = re.sub(r'\s+', ' ', anchor_text_lower)
472
+ if normalized_anchor in normalized_full:
473
+ keyword_present = True
474
+ print(f"Found anchor via normalized search")
475
+
476
+ # Method 3: Check each block individually
477
+ if not keyword_present:
478
+ for i, block in enumerate(blocks):
479
+ if anchor_text_lower in block.lower():
480
+ keyword_present = True
481
+ print(f"Found anchor in block {i}: {block[:100]}...")
482
+ break
483
+
484
+ print(f"Keyword present in article: {keyword_present}")
485
 
486
+ # Target context for similarity matching
487
  try:
488
  tgt_html = requests.get(target_url, timeout=20, headers=UA).text
489
  tt = BeautifulSoup(tgt_html, "html.parser").title
 
495
  ext = tldextract.extract(target_url)
496
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
497
 
498
+ # Find best match with original anchor
499
+ query = f"{anchor_text} β€” relevant to: {tgt_title} ({tgt_domain})"
500
 
501
  try:
502
  q_emb = embed([query])[0]
 
511
  results = []
512
  for idx in top_idx:
513
  try:
514
+ idx = min(idx, len(blocks)-1) # Ensure valid index
515
+ blk = blocks[idx]
516
 
517
  # Split sentences more carefully
518
  sents = re.split(r'(?<=[.!?])\s+', blk)
 
530
  if len(sents) > 0 and all(len(s) > 0 for s in sents):
531
  s_embs = embed(sents)
532
  s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
533
+ si = int(torch.argmax(s_sims).item())
534
  if 0 <= si < len(sents):
535
  best_sent = sents[si]
536
  except Exception as e:
 
541
  if not best_sent or len(best_sent.strip()) == 0:
542
  best_sent = blk if blk else "Unable to extract sentence from this section."
543
 
544
+ # Check if anchor is in THIS specific sentence
545
  sentence_lower = best_sent.lower()
546
  anchor_found_in_sentence = anchor_text_lower in sentence_lower
547
 
548
  # If not found with exact match, try normalized
549
  if not anchor_found_in_sentence:
550
+ normalized_sent = re.sub(r'\s+', ' ', sentence_lower)
551
+ normalized_anchor = re.sub(r'\s+', ' ', anchor_text_lower)
552
  anchor_found_in_sentence = normalized_anchor in normalized_sent
553
 
554
  rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
555
 
556
  result = {
557
+ "anchor_was_present": anchor_found_in_sentence,
558
  "best_sentence_original": best_sent,
559
  "best_sentence_with_anchor": rewritten_sent,
560
+ "keyword_in_article": keyword_present
561
  }
562
 
563
  # If anchor not present in article and alternative suggestion requested
 
581
 
582
  except Exception as e:
583
  print(f"Error processing block {idx}: {e}")
584
+ import traceback
585
+ traceback.print_exc()
586
  # Add a fallback result
587
  results.append({
588
  "anchor_was_present": False,
 
657
  "(2) Do NOT use an em dash or any dash. "
658
  '(3) Avoid phrases like "for details", "click here", "learn more", "visit", "read more". '
659
  "Prefer integrating the anchor as part of the sentence. "
660
+ f"(4) Write in {language} and preserve ALL special characters (Δ‡, č, Ε‘, ΕΎ, Δ‘, etc.). "
661
  "Return a compact JSON object with key sentence_html only."
662
  )
663
 
 
750
  return html.unescape(text)
751
 
752
  # =========================
753
+ # Gradio UI
754
  # =========================
755
  def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor):
756
  if not source_url or not target_url or not anchor_text:
 
786
 
787
  # Check if anchor was already present in the article
788
  anchor_was_present = res.get("anchor_was_present", False)
789
+ keyword_in_article = res.get("keyword_in_article", False)
790
+
791
+ # If anchor is present in the article (even if not in the best sentence)
792
+ if keyword_in_article:
793
+ # Anchor exists somewhere in article
794
+ if anchor_was_present:
795
+ # Anchor is in the suggested sentence - just show where to add the link
796
+ final_output = to_plain_text(draft_html) if plain_text else draft_html
797
+ result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
798
+ result += f"πŸ“ Add link here:\n\n"
799
+ result += f"Original: {original_sentence}\n\n"
800
+ result += f"With link: {final_output}"
801
+ else:
802
+ # Anchor is in article but not in this sentence - show this sentence as an option
803
+ # and note that the anchor exists elsewhere
804
+ if smart_rewrite:
805
+ g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
806
+ final_html = g["sentence_html"]
807
+ else:
808
+ final_html = draft_html
809
+
810
+ polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
811
+ final_html = polished.get("sentence_html", final_html)
812
+ final_output = to_plain_text(final_html) if plain_text else final_html
813
+
814
+ result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
815
+ result += f"πŸ“ The anchor appears elsewhere in the article. Here's a contextually relevant placement:\n\n"
816
+ result += f"Original: {original_sentence}\n\n"
817
+ result += f"Suggested: {final_output}\n\n"
818
+ result += f"πŸ’‘ Note: You may want to search for '{anchor_text}' in the article to find where it naturally appears."
819
  else:
820
+ # Anchor doesn't exist in article at all - need to add it
 
821
  if smart_rewrite:
822
  g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral", language=language_name)
823
  final_html = g["sentence_html"]
824
  else:
825
  final_html = draft_html
826
 
 
827
  polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
828
  final_html = polished.get("sentence_html", final_html)
 
 
829
  final_output = to_plain_text(final_html) if plain_text else final_html
830
 
 
831
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
832
  result += f"πŸ“ Result 1 - Suggested placement:\n\n"
833
  result += f"Original: {original_sentence}\n\n"
834
  result += f"Suggested: {final_output}"
835
 
836
+ # Show alternative if requested and available
837
+ if suggest_alternative_anchor and res.get("alternative_anchor"):
 
 
 
838
  alt_anchor = res["alternative_anchor"]
839
  alt_sentence_original = res.get("alternative_sentence_original", "")
840
  alt_sentence = res.get("alternative_sentence", "")
 
895
  plain_text = gr.Checkbox(label="Plain text (no URL)", value=True)
896
  suggest_alternative_anchor = gr.Checkbox(
897
  label="Suggest alternative anchor",
898
+ value=True,
899
  info="If anchor not found, suggest a better anchor from the article"
900
  )
901
 
 
920
 
921
  gr.Markdown("""
922
  ### Features:
923
+ - 🌍 **Auto Language Detection**: Preserves special characters (Δ‡, č, Ε‘, ΕΎ, Δ‘, etc.)
924
  - πŸ’Ύ **Smart Caching**: Caches embeddings and API responses for faster repeated queries
925
  - 🎯 **Main Content Focus**: Prioritizes first 5-7 paragraphs, ignores author bios
926
  - πŸ”„ **Alternative Anchor Suggestion**: When your anchor isn't in the article, suggests better anchors from existing text