dusan-presswhizz committed on
Commit
74325d3
·
verified ·
1 Parent(s): bb074b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -149
app.py CHANGED
@@ -262,134 +262,63 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
262
  return rewritten, False
263
 
264
def find_alternative_anchor(blocks, target_url, original_anchor):
    """Find a better anchor text from the article that relates to the target URL.

    Fetches the target page to build a relevance context (title, meta
    description, lead paragraphs), extracts candidate 1-4 word phrases from
    the article ``blocks``, and scores each candidate (and the sentence it
    appears in) against the target context via embedding cosine similarity.

    Args:
        blocks: List of article text blocks (paragraphs) to mine for anchors.
        target_url: URL of the page the anchor should link to.
        original_anchor: Anchor the user asked for; candidates equal to it
            (case-insensitively) are skipped, and it is used as a fallback
            context when the target page cannot be fetched.

    Returns:
        Tuple ``(best_anchor, best_sentence)``, or ``(None, None)`` when no
        candidate is found or the embedding step fails.
    """
    try:
        # --- Target page context (best effort; fall back to the original anchor) ---
        try:
            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
            soup = BeautifulSoup(tgt_html, "html.parser")

            # Title + meta description summarize what the target is about.
            title = soup.title.get_text().strip() if soup.title else ""
            meta_desc = ""
            meta_tag = soup.find("meta", attrs={"name": "description"})
            if meta_tag:
                meta_desc = meta_tag.get("content", "")

            # Lead paragraphs (>50 chars) add topical context for scoring.
            target_paragraphs = []
            for p in soup.find_all("p")[:5]:
                text = p.get_text().strip()
                if len(text) > 50:
                    target_paragraphs.append(text)
            target_content = " ".join(target_paragraphs[:3])

        except Exception as e:
            print(f"Error fetching target URL: {e}")
            title = ""
            meta_desc = ""
            target_content = original_anchor

        # --- Candidate phrases from the source article ---
        all_phrases = set()
        full_text = " ".join(blocks)

        # Common words to exclude (English + Croatian/Serbian function words).
        stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                     'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
                     'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                     'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
                     'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što'}

        sentences = re.split(r'[.!?]', full_text)
        for sentence in sentences:
            if not sentence:
                continue
            words = sentence.split()

            # Phrases of 2-4 words; skip those that start/end on a stopword.
            # (FIX: removed always-false/always-true bounds checks the loop
            # ranges already guarantee.)
            for length in range(2, min(5, len(words) + 1)):
                for i in range(len(words) - length + 1):
                    phrase = ' '.join(words[i:i + length])
                    phrase_clean = phrase.strip('.,!?;:"\' ')

                    first_word = words[i].lower().strip('.,!?;:')
                    last_word = words[i + length - 1].lower().strip('.,!?;:')
                    if (first_word not in stopwords and
                            last_word not in stopwords and
                            5 < len(phrase_clean) < 50):
                        all_phrases.add(phrase_clean)

            # Single important words: long words, or capitalized non-stopwords
            # (likely proper nouns).
            for word in words:
                clean_word = word.strip('.,!?;:"\' ')
                if clean_word and (len(clean_word) > 6 or
                                   (clean_word[0].isupper() and
                                    clean_word.lower() not in stopwords)):
                    all_phrases.add(clean_word)

        if not all_phrases:
            return None, None

        # --- Score candidates against the target context ---
        target_context = f"{title} {meta_desc} {target_content}"[:500]
        try:
            target_emb = embed([target_context])[0]
        except Exception:  # FIX: was a bare except (swallowed SystemExit/KeyboardInterrupt)
            return None, None

        best_anchor = None
        best_score = -1
        best_sentence = None

        # Cap at 50 candidates to bound embedding work.
        for phrase in list(all_phrases)[:50]:
            # Skip the original anchor -- we want something different.
            if phrase.lower() == original_anchor.lower():
                continue

            try:
                phrase_emb = embed([phrase])[0]
                relevance_score = F.cosine_similarity(
                    phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()

                # Only phrases actually present in the article can be anchors;
                # score each containing sentence to find the best placement.
                if phrase.lower() in full_text.lower():
                    for block in blocks:
                        if phrase.lower() not in block.lower():
                            continue
                        sents = re.split(r'(?<=[.!?])\s+', block)
                        for sent in sents:
                            if sent and phrase.lower() in sent.lower():
                                try:
                                    sent_emb = embed([sent])[0]
                                    context_score = F.cosine_similarity(
                                        sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
                                    # Weight phrase relevance over sentence context.
                                    combined_score = (relevance_score * 0.6) + (context_score * 0.4)

                                    if combined_score > best_score:
                                        best_score = combined_score
                                        best_anchor = phrase
                                        best_sentence = sent
                                except Exception:  # FIX: was a bare except
                                    continue
            except Exception as e:
                print(f"Error evaluating phrase '{phrase}': {e}")
                continue

        return best_anchor, best_sentence

    except Exception as e:
        print(f"Critical error in find_alternative_anchor: {e}")
        return None, None
394
 
395
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
@@ -514,18 +443,16 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
514
  # If anchor not present in article and alternative suggestion requested
515
  if suggest_alternative and not keyword_present:
516
  try:
517
- # Find a completely different anchor and sentence
518
- alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)
519
 
520
- if alt_anchor and alt_sentence:
521
- # Create the sentence with the alternative anchor
522
- alt_rewritten, alt_exact = inject_anchor_into_sentence(alt_sentence, alt_anchor, target_url)
523
  result["alternative_anchor"] = alt_anchor
524
- result["alternative_sentence_original"] = alt_sentence
525
- result["alternative_sentence"] = alt_rewritten
526
- result["alternative_exact_match"] = alt_exact
527
  except Exception as e:
528
- print(f"Error finding alternative anchor: {e}")
529
  # Continue without alternative
530
 
531
  results.append(result)
@@ -694,6 +621,97 @@ def gpt_validate_and_polish(sentence_html, anchor_text, target_url, language="En
694
 
695
  return {"sentence_html": out}
696
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  def to_plain_text(html_or_text):
698
  """Convert HTML to plain text, properly handling special characters."""
699
  text = BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
@@ -746,7 +764,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
746
  # Anchor is in the suggested sentence - just show where to add the link
747
  final_output = to_plain_text(draft_html) if plain_text else draft_html
748
  result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
749
- result += f"πŸ“ Add link here:\n\n"
750
  result += f"{final_output}"
751
  else:
752
  # Anchor is in article but not in this sentence
@@ -761,7 +779,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
761
  final_output = to_plain_text(final_html) if plain_text else final_html
762
 
763
  result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
764
- result += f"πŸ“ Add link here:\n\n"
765
  result += f"{final_output}"
766
  else:
767
  # Anchor doesn't exist in article at all - need to add it
@@ -776,41 +794,35 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
776
  final_output = to_plain_text(final_html) if plain_text else final_html
777
 
778
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
779
- result += f"πŸ“ Result 1 - Suggested placement:\n\n"
780
  result += f"Original: {original_sentence}\n\n"
781
  result += f"Suggested: {final_output}"
782
 
783
  # Show alternative if requested and available
784
  if suggest_alternative_anchor and res.get("alternative_anchor"):
785
  alt_anchor = res["alternative_anchor"]
786
- alt_sentence_original = res.get("alternative_sentence_original", "")
787
- alt_sentence = res.get("alternative_sentence", "")
788
 
789
- # Detect language for alternative sentence
790
- if alt_sentence_original:
791
- alt_detected_lang = detect_language(alt_sentence_original)
792
- alt_language_name = get_language_name(alt_detected_lang)
793
-
794
- # Apply GPT rewriting to alternative as well
795
- if smart_rewrite and alt_sentence:
796
- alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=alt_language_name)
797
- alt_final = alt_g["sentence_html"]
798
  else:
799
- alt_final = alt_sentence
800
-
801
- # Polish if needed
802
- if not res.get("alternative_exact_match", False):
803
- alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=alt_language_name)
804
- alt_final = alt_polished.get("sentence_html", alt_final)
805
 
806
- alt_output = to_plain_text(alt_final) if plain_text else alt_final
 
807
 
808
  # Add alternative as Result 2
809
  result += f"\n\n{'='*50}\n\n"
810
- result += f"πŸ“ Result 2 - Alternative from article:\n"
811
- result += f"πŸ’‘ Alternative anchor: '{alt_anchor}'\n\n"
812
- result += f"Original: {alt_sentence_original}\n\n"
813
- result += f"Suggested: {alt_output}"
 
814
 
815
  return result
816
 
 
262
  return rewritten, False
263
 
264
def find_alternative_anchor(blocks, target_url, original_anchor):
    """
    NEW VERSION: Generate new content with keywords from target page.

    Pipeline: extract the target page's text, ask GPT for search keywords
    people would use to find it, detect the source article's language, then
    ask GPT to write a short linked addition using the best-fitting keyword.

    Args:
        blocks: Source article paragraphs (used for language detection and
            as context for content generation).
        target_url: URL the generated content should link to.
        original_anchor: User-supplied anchor; last-resort keyword fallback.

    Returns:
        Tuple ``(chosen_keyword, content)`` where ``content`` is
        ``"[Insert after paragraph N]: reasoning\\n\\n<html content>"``,
        or ``(None, None)`` on any failure.
    """
    try:
        print(f"[Alternative] Extracting target page content from {target_url}")

        # Step 1: Extract target page content using Trafilatura
        target_blocks = get_text_blocks(target_url, max_paragraphs=5)
        if not target_blocks:
            print("[Alternative] No content extracted from target page")
            return None, None

        print(f"[Alternative] Extracted {len(target_blocks)} blocks from target")

        # Step 2: Get search keywords from target content
        keywords = gpt_get_search_keywords(target_blocks, target_url)
        print(f"[Alternative] Keywords identified: {keywords}")

        if not keywords or not isinstance(keywords, list):
            print("[Alternative] No valid keywords returned")
            return None, None

        # Step 3: Detect language from source article
        source_text = " ".join(blocks[:2])  # Use first 2 paragraphs for detection
        detected_lang = detect_language(source_text)
        language_name = get_language_name(detected_lang)
        print(f"[Alternative] Detected language: {language_name}")

        # Step 4: Generate new content with keyword
        result = gpt_generate_content_with_keyword(
            source_blocks=blocks,
            keywords=keywords,
            target_url=target_url,
            language=language_name
        )

        if not result:
            print("[Alternative] Content generation failed")
            return None, None

        # Return in format compatible with existing code
        chosen_keyword = result.get("chosen_keyword") or (keywords[0] if keywords else original_anchor)
        new_content = result.get("new_content", "")
        # FIX: the model may return the paragraph index as a string or junk;
        # coerce to int so the "+ 1" below cannot raise TypeError.
        try:
            insert_after = int(result.get("insert_after_paragraph", 0))
        except (TypeError, ValueError):
            insert_after = 0
        reasoning = result.get("reasoning", "")

        # Format the response for compatibility:
        # return (anchor_text, formatted_content_with_position).
        position_text = f"[Insert after paragraph {insert_after + 1}]: {reasoning}"

        return chosen_keyword, f"{position_text}\n\n{new_content}"

    except Exception as e:
        print(f"[Alternative] Critical error: {e}")
        import traceback
        traceback.print_exc()
        return None, None
323
 
324
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
 
443
  # If anchor not present in article and alternative suggestion requested
444
  if suggest_alternative and not keyword_present:
445
  try:
446
+ # Generate new content with keywords from target page
447
+ alt_anchor, alt_content = find_alternative_anchor(blocks, target_url, anchor_text)
448
 
449
+ if alt_anchor and alt_content:
 
 
450
  result["alternative_anchor"] = alt_anchor
451
+ result["alternative_sentence_original"] = "" # No original since it's new content
452
+ result["alternative_sentence"] = alt_content
453
+ result["alternative_exact_match"] = True # It's generated with the link
454
  except Exception as e:
455
+ print(f"Error generating alternative content: {e}")
456
  # Continue without alternative
457
 
458
  results.append(result)
 
621
 
622
  return {"sentence_html": out}
623
 
624
def gpt_get_search_keywords(target_content, target_url):
    """
    Analyze target page content and get search keywords people would use.

    Args:
        target_content: Target page text, either a list of paragraph strings
            or a single string.
        target_url: URL of the target page (sent to the model as context and
            mixed into the cache key).

    Returns:
        A non-empty list of keyword strings; falls back to generic phrases
        when no API key is configured or the call fails.
    """
    fallback = ["related content", "learn more", "additional information"]
    if not OPENAI_API_KEY:
        return fallback

    # Normalize content first (limit to avoid token limits) so the prompt
    # and the cache key agree.
    # FIX: the cache key previously hashed the raw value, which for list
    # input was the repr of a list slice rather than the text sent to GPT.
    content_preview = " ".join(target_content[:5]) if isinstance(target_content, list) else target_content[:3000]

    # Cache key: stable per (url, content prefix).
    cache_key = hashlib.md5(f"keywords_{target_url}_{content_preview[:500]}".encode()).hexdigest()

    system = (
        "You are an SEO expert. Analyze the provided web page content and identify "
        "5-10 search keywords or phrases that people would typically use to find this page. "
        "Focus on practical, real search terms that users would type into Google. "
        "Return a JSON object with a 'keywords' array containing 5-10 keyword phrases."
    )

    user = {
        "task": "identify_search_keywords",
        "page_content": content_preview,
        "url": target_url,
        "requirements": {
            "count": "5-10 keywords",
            "type": "practical search terms",
            "focus": "what users would actually search for"
        }
    }

    try:
        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    except Exception as e:
        print(f"[GPT] Keywords extraction failed: {e}")
        return fallback

    keywords = obj.get("keywords", ["related content"])
    # FIX: guard against malformed model output (non-list, empty, or
    # non-string items) so callers always get a list of strings.
    if not isinstance(keywords, list):
        return ["related content"]
    cleaned = [str(k).strip() for k in keywords if k and str(k).strip()]
    return cleaned or ["related content"]
662
+
663
def gpt_generate_content_with_keyword(source_blocks, keywords, target_url, language="English"):
    """
    Generate new content with the best keyword and specify where to insert it.

    Args:
        source_blocks: Article paragraphs giving the model context (first 7
            are sent).
        keywords: Candidate keyword phrases; the model picks the best fit.
        target_url: URL the chosen keyword should be linked to.
        language: Language name the new content must be written in.

    Returns:
        The parsed JSON dict from the model, expected to contain
        'chosen_keyword', 'new_content' (HTML with an <a href> link),
        'insert_after_paragraph' (0-based), and 'reasoning'; or None when
        no API key / no keywords / both model calls fail.
    """
    if not OPENAI_API_KEY or not keywords:
        return None

    # Cache key: stable per (article prefix, keywords, url, language).
    source_preview = " ".join(source_blocks[:3])[:500]
    cache_key = hashlib.md5(
        f"generate_{source_preview}_{str(keywords)}_{target_url}_{language}".encode()
    ).hexdigest()

    system = (
        f"You are a skilled content writer writing in {language}. "
        "Given an article and a list of keywords related to a target page, "
        "create a NATURAL addition to the article that incorporates the most suitable keyword. "
        "The addition should flow seamlessly with the existing content. "
        "\n\nYOUR TASK:\n"
        "1. Choose the ONE keyword that fits most naturally with the article's context\n"
        "2. Create new content (1-3 sentences OR a paragraph if needed) that naturally includes this keyword\n"
        "3. Specify AFTER which paragraph number (0-based) to insert this content\n"
        "4. The keyword should be wrapped in an HTML link to the target URL\n"
        f"5. Write in {language} and preserve special characters\n"
        "\n\nReturn JSON with keys:\n"
        "- 'chosen_keyword': the keyword you selected\n"
        "- 'new_content': the HTML content with <a href> link\n"
        "- 'insert_after_paragraph': paragraph number (0-based) after which to insert\n"
        "- 'reasoning': brief explanation of placement choice"
    )

    user = {
        "article_paragraphs": source_blocks[:7],  # First 7 paragraphs for context
        "available_keywords": keywords,
        "target_url": target_url,
        "language": language,
        "requirements": {
            "natural_flow": True,
            "include_link": True,
            "preserve_tone": True
        }
    }

    try:
        return _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    except Exception as e:
        print(f"[GPT] Content generation failed: {e}")
        # Retry once on the fallback model with a distinct cache key.
        try:
            return _openai_chat_cached(cache_key + "_fallback", FALLBACK_OPENAI_MODEL, system, user)
        except Exception:  # FIX: was a bare except (swallowed SystemExit/KeyboardInterrupt)
            return None
714
+
715
  def to_plain_text(html_or_text):
716
  """Convert HTML to plain text, properly handling special characters."""
717
  text = BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
 
764
  # Anchor is in the suggested sentence - just show where to add the link
765
  final_output = to_plain_text(draft_html) if plain_text else draft_html
766
  result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
767
+ result += f"πŸ”— Add link here:\n\n"
768
  result += f"{final_output}"
769
  else:
770
  # Anchor is in article but not in this sentence
 
779
  final_output = to_plain_text(final_html) if plain_text else final_html
780
 
781
  result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
782
+ result += f"πŸ”— Add link here:\n\n"
783
  result += f"{final_output}"
784
  else:
785
  # Anchor doesn't exist in article at all - need to add it
 
794
  final_output = to_plain_text(final_html) if plain_text else final_html
795
 
796
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
797
+ result += f"πŸ”— Result 1 - Suggested placement:\n\n"
798
  result += f"Original: {original_sentence}\n\n"
799
  result += f"Suggested: {final_output}"
800
 
801
  # Show alternative if requested and available
802
  if suggest_alternative_anchor and res.get("alternative_anchor"):
803
  alt_anchor = res["alternative_anchor"]
804
+ alt_content = res.get("alternative_sentence", "") # This now contains position info + content
 
805
 
806
+ if alt_content:
807
+ # Parse if there's position information
808
+ if "[Insert after paragraph" in alt_content:
809
+ parts = alt_content.split("\n\n", 1)
810
+ position_info = parts[0] if len(parts) > 0 else ""
811
+ actual_content = parts[1] if len(parts) > 1 else alt_content
 
 
 
812
  else:
813
+ position_info = ""
814
+ actual_content = alt_content
 
 
 
 
815
 
816
+ # The content already has the link included from GPT
817
+ alt_output = to_plain_text(actual_content) if plain_text else actual_content
818
 
819
  # Add alternative as Result 2
820
  result += f"\n\n{'='*50}\n\n"
821
+ result += f"πŸ”— Result 2 - Suggested new content to add:\n"
822
+ result += f"πŸ’‘ Using keyword: '{alt_anchor}'\n"
823
+ if position_info:
824
+ result += f"πŸ“ {position_info}\n"
825
+ result += f"\n{alt_output}"
826
 
827
  return result
828