dusan-presswhizz committed on
Commit
4bc41f9
·
verified ·
1 Parent(s): bfd5b70

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -26
app.py CHANGED
@@ -226,7 +226,36 @@ def create_anchor_suggestion(anchor_text, target_url):
226
  ]
227
  return suggestions[0]
228
 
229
- def suggest_insertions(source_url, target_url, anchor_text, top_k=1, check_keyword_presence=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  blocks = get_text_blocks(source_url)
231
  if not blocks:
232
  return [{"error":"No text blocks found on the page."}]
@@ -235,16 +264,6 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, check_keywo
235
  full_text = " ".join(blocks).lower()
236
  keyword_present = anchor_text.lower() in full_text
237
 
238
- if check_keyword_presence and not keyword_present:
239
- # Return suggestion for adding new content
240
- suggestion = create_anchor_suggestion(anchor_text, target_url)
241
- return [{
242
- "keyword_not_found": True,
243
- "suggestion": suggestion,
244
- "anchor_text": anchor_text,
245
- "target_url": target_url
246
- }]
247
-
248
  # target context
249
  try:
250
  tgt_html = requests.get(target_url, timeout=20, headers=UA).text
@@ -256,6 +275,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, check_keywo
256
  ext = tldextract.extract(target_url)
257
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
258
 
 
259
  query = f"{anchor_text} β€” relevant to: {tgt_title} ({tgt_domain})"
260
  q_emb = embed([query])[0]
261
 
@@ -272,11 +292,42 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, check_keywo
272
  si = int(torch.argmax(s_sims))
273
  best_sent = sents[si]
274
  rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
275
- results.append({
 
276
  "anchor_was_present": exact_found,
277
  "best_sentence_original": best_sent,
278
- "best_sentence_with_anchor": rewritten_sent
279
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  return results
281
 
282
  # =========================
@@ -426,7 +477,7 @@ def to_plain_text(html_or_text):
426
  # =========================
427
  # Gradio UI
428
  # =========================
429
- def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, check_keyword):
430
  if not source_url or not target_url or not anchor_text:
431
  return "❌ Please provide Source URL, Target URL, and Anchor Text."
432
 
@@ -440,23 +491,20 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, che
440
  target_url = normalize_url(target_url)
441
 
442
  try:
443
- res = suggest_insertions(source_url, target_url, anchor_text, top_k=1, check_keyword_presence=check_keyword)[0]
444
  except Exception as e:
445
  return f"❌ Error processing the page: {str(e)}"
446
 
447
  if "error" in res:
448
  return f"❌ {res['error']}"
449
 
450
- # Handle case where keyword is not found
451
- if res.get("keyword_not_found"):
452
- return f"⚠️ Keyword '{anchor_text}' not found in the article.\n\n{res['suggestion']}"
453
-
454
  # Detect language from the original sentence
455
  original_sentence = res['best_sentence_original']
456
  detected_lang = detect_language(original_sentence)
457
  language_name = get_language_name(detected_lang)
458
  print(f"Detected language: {language_name} ({detected_lang})")
459
 
 
460
  draft_html = res["best_sentence_with_anchor"]
461
 
462
  # 1) Optional first-pass rewrite with language support
@@ -474,10 +522,38 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, che
474
  # 3) Optionally convert to plain text
475
  final_output = to_plain_text(final_html) if plain_text else final_html
476
 
 
477
  if res.get("anchor_was_present", False):
478
- return warn + f"βœ… Add link here:\n\n{final_output}"
479
  else:
480
- return warn + f"Change this sentence:\n\n{original_sentence}\n\nWith this one:\n\n{final_output}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
 
482
  def clear_cache():
483
  """Clear all caches."""
@@ -505,8 +581,8 @@ with gr.Blocks(title=f"Link Insertion Helper β€’ GPT: {gpt_status}") as demo:
505
  with gr.Row():
506
  smart_rewrite = gr.Checkbox(label="Smart rewrite (GPT)", value=True)
507
  plain_text = gr.Checkbox(label="Plain text (no URL)", value=False)
508
- check_keyword = gr.Checkbox(label="Check keyword presence", value=False,
509
- info="If enabled, suggests where to add new content when keyword is not found")
510
 
511
  with gr.Row():
512
  submit_btn = gr.Button("Process", variant="primary")
@@ -518,7 +594,7 @@ with gr.Blocks(title=f"Link Insertion Helper β€’ GPT: {gpt_status}") as demo:
518
 
519
  submit_btn.click(
520
  fn=run_tool,
521
- inputs=[source_url, target_url, anchor_text, smart_rewrite, plain_text, check_keyword],
522
  outputs=output
523
  )
524
 
@@ -532,7 +608,7 @@ with gr.Blocks(title=f"Link Insertion Helper β€’ GPT: {gpt_status}") as demo:
532
  - 🌍 **Auto Language Detection**: Preserves special characters (č, ć, š, ž, đ, etc.)
533
  - 💾 **Smart Caching**: Caches embeddings and API responses for faster repeated queries
534
  - 🎯 **Main Content Focus**: Prioritizes first 5-7 paragraphs, ignores author bios
535
- - 🔍 **Keyword Check**: Optional detection when anchor text is not in article
536
  - ✨ **Natural Integration**: AI-powered sentence rewriting for seamless link placement
537
  """)
538
 
 
226
  ]
227
  return suggestions[0]
228
 
229
def extract_potential_anchors(sentence, target_url):
    """Extract potential anchor text phrases from a sentence.

    Candidates are produced in a fixed order: first every contiguous run of
    2-5 whitespace-separated words whose first and last word are neither a
    stopword nor punctuation-only (shorter runs first, left to right), then
    single words that are longer than 7 characters or start with an
    uppercase letter. Only the first five candidates are returned, so the
    single words appear only when the sentence yields few phrases.

    Args:
        sentence: Text to mine for candidate anchors.
        target_url: Accepted for call-site compatibility; not used here.

    Returns:
        A list of at most five candidate anchor strings, with leading and
        trailing ``.,!?;:`` removed.
    """
    stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
                 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should'}
    punctuation = '.,!?;:'

    words = sentence.split()
    # Clean each token once instead of re-stripping it for every phrase.
    cleaned = [word.strip(punctuation) for word in words]
    lowered = [token.lower() for token in cleaned]

    def is_valid_boundary(token):
        # A punctuation-only token cleans to "" and must not open or close a
        # phrase; otherwise candidates like "Hi " (from "Hi ...") slip in.
        return bool(token) and token not in stopwords

    phrases = []

    # Generate phrases of 2-5 words.
    for length in range(2, min(6, len(words) + 1)):
        for start in range(len(words) - length + 1):
            end = start + length
            if is_valid_boundary(lowered[start]) and is_valid_boundary(lowered[end - 1]):
                phrases.append(' '.join(words[start:end]).strip(punctuation))

    # Also add significant single words (capitalised words, long words).
    for token in cleaned:
        if not token:
            # Punctuation-only token: indexing token[0] would raise IndexError.
            continue
        if (len(token) > 7 or token[0].isupper()) and token.lower() not in stopwords:
            phrases.append(token)

    return phrases[:5]  # Keep only the first five candidates
257
+
258
+ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
259
  blocks = get_text_blocks(source_url)
260
  if not blocks:
261
  return [{"error":"No text blocks found on the page."}]
 
264
  full_text = " ".join(blocks).lower()
265
  keyword_present = anchor_text.lower() in full_text
266
 
 
 
 
 
 
 
 
 
 
 
267
  # target context
268
  try:
269
  tgt_html = requests.get(target_url, timeout=20, headers=UA).text
 
275
  ext = tldextract.extract(target_url)
276
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
277
 
278
+ # First, find best match with original anchor
279
  query = f"{anchor_text} β€” relevant to: {tgt_title} ({tgt_domain})"
280
  q_emb = embed([query])[0]
281
 
 
292
  si = int(torch.argmax(s_sims))
293
  best_sent = sents[si]
294
  rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
295
+
296
+ result = {
297
  "anchor_was_present": exact_found,
298
  "best_sentence_original": best_sent,
299
+ "best_sentence_with_anchor": rewritten_sent,
300
+ "keyword_in_article": keyword_present
301
+ }
302
+
303
+ # If anchor not present and alternative suggestion requested
304
+ if suggest_alternative and not keyword_present:
305
+ # Extract potential anchor phrases from the best sentence
306
+ potential_anchors = extract_potential_anchors(best_sent, target_url)
307
+
308
+ # Find the best alternative anchor
309
+ best_alternative = None
310
+ best_alt_score = -1
311
+
312
+ for alt_anchor in potential_anchors:
313
+ # Check relevance to target
314
+ alt_query = f"{alt_anchor} β€” relevant to: {tgt_title}"
315
+ alt_q_emb = embed([alt_query])[0]
316
+ alt_sim = F.cosine_similarity(alt_q_emb.unsqueeze(0), q_emb.unsqueeze(0)).item()
317
+
318
+ if alt_sim > best_alt_score:
319
+ best_alt_score = alt_sim
320
+ best_alternative = alt_anchor
321
+
322
+ if best_alternative:
323
+ # Create alternative suggestion with the better anchor
324
+ alt_rewritten, alt_exact = inject_anchor_into_sentence(best_sent, best_alternative, target_url)
325
+ result["alternative_anchor"] = best_alternative
326
+ result["alternative_sentence"] = alt_rewritten
327
+ result["alternative_exact_match"] = alt_exact
328
+
329
+ results.append(result)
330
+
331
  return results
332
 
333
  # =========================
 
477
  # =========================
478
  # Gradio UI
479
  # =========================
480
+ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor):
481
  if not source_url or not target_url or not anchor_text:
482
  return "❌ Please provide Source URL, Target URL, and Anchor Text."
483
 
 
491
  target_url = normalize_url(target_url)
492
 
493
  try:
494
+ res = suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=suggest_alternative_anchor)[0]
495
  except Exception as e:
496
  return f"❌ Error processing the page: {str(e)}"
497
 
498
  if "error" in res:
499
  return f"❌ {res['error']}"
500
 
 
 
 
 
501
  # Detect language from the original sentence
502
  original_sentence = res['best_sentence_original']
503
  detected_lang = detect_language(original_sentence)
504
  language_name = get_language_name(detected_lang)
505
  print(f"Detected language: {language_name} ({detected_lang})")
506
 
507
+ # Process original anchor suggestion
508
  draft_html = res["best_sentence_with_anchor"]
509
 
510
  # 1) Optional first-pass rewrite with language support
 
522
  # 3) Optionally convert to plain text
523
  final_output = to_plain_text(final_html) if plain_text else final_html
524
 
525
+ # Build the main result
526
  if res.get("anchor_was_present", False):
527
+ result = warn + f"βœ… Add link here:\n\n{final_output}"
528
  else:
529
+ result = warn + f"πŸ“ OPTION 1 - Your anchor text:\n\nChange this sentence:\n{original_sentence}\n\nWith this one:\n{final_output}"
530
+
531
+ # Process alternative anchor if requested and original anchor not found
532
+ if suggest_alternative_anchor and not res.get("keyword_in_article", True) and res.get("alternative_anchor"):
533
+ alt_anchor = res["alternative_anchor"]
534
+ alt_sentence = res["alternative_sentence"]
535
+
536
+ # Apply GPT rewriting to alternative as well
537
+ if smart_rewrite:
538
+ alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=language_name)
539
+ alt_final = alt_g["sentence_html"]
540
+ else:
541
+ alt_final = alt_sentence
542
+
543
+ # Polish if needed
544
+ if not res.get("alternative_exact_match", False):
545
+ alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=language_name)
546
+ alt_final = alt_polished.get("sentence_html", alt_final)
547
+
548
+ alt_output = to_plain_text(alt_final) if plain_text else alt_final
549
+
550
+ result += f"\n\n{'='*50}\n\n"
551
+ result += f"πŸ’‘ OPTION 2 - Better anchor suggestion:\n\n"
552
+ result += f"Since '{anchor_text}' is not in the article, consider using:\n"
553
+ result += f"Suggested anchor: '{alt_anchor}'\n\n"
554
+ result += f"Change this sentence:\n{original_sentence}\n\nWith this one:\n{alt_output}"
555
+
556
+ return result
557
 
558
  def clear_cache():
559
  """Clear all caches."""
 
581
  with gr.Row():
582
  smart_rewrite = gr.Checkbox(label="Smart rewrite (GPT)", value=True)
583
  plain_text = gr.Checkbox(label="Plain text (no URL)", value=False)
584
+ suggest_alternative_anchor = gr.Checkbox(label="Suggest alternative anchor", value=False,
585
+ info="If anchor not found, suggest a better anchor from the article")
586
 
587
  with gr.Row():
588
  submit_btn = gr.Button("Process", variant="primary")
 
594
 
595
  submit_btn.click(
596
  fn=run_tool,
597
+ inputs=[source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor],
598
  outputs=output
599
  )
600
 
 
608
  - 🌍 **Auto Language Detection**: Preserves special characters (č, ć, š, ž, đ, etc.)
609
  - 💾 **Smart Caching**: Caches embeddings and API responses for faster repeated queries
610
  - 🎯 **Main Content Focus**: Prioritizes first 5-7 paragraphs, ignores author bios
611
+ - 🔄 **Alternative Anchor Suggestion**: When your anchor isn't in the article, suggests better anchors from existing text
612
  - ✨ **Natural Integration**: AI-powered sentence rewriting for seamless link placement
613
  """)
614