dusan-presswhizz committed on
Commit
6e710b3
·
verified ·
1 Parent(s): 8b2a25c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -166
app.py CHANGED
@@ -261,163 +261,136 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
261
  rewritten = f'{base}{clause}{punct}'
262
  return rewritten, False
263
 
264
- def find_alternative_anchor(blocks, target_url, original_anchor, target_context=None):
265
  """Find a better anchor text from the article that relates to the target URL."""
266
  try:
267
- # Use provided target context or analyze the URL
268
- if not target_context:
269
- target_context = analyze_target_url(target_url)
270
-
271
- # Extract key concepts from target page to understand what it's about
272
- target_title = target_context.get("title", "").lower()
273
- target_meta = target_context.get("meta_description", "").lower()
274
- target_headings = " ".join(target_context.get("headings", [])).lower()
275
- target_content = target_context.get("main_content", "").lower()
276
-
277
- # Extract important keywords from target page (what the page is ACTUALLY about)
278
- target_keywords = set()
279
-
280
- # Common service/product related terms from the target
281
- important_words = []
282
- for text in [target_title, target_meta, target_headings, target_content[:500]]:
283
- # Extract meaningful nouns and phrases
284
- words = text.split()
285
- for word in words:
286
- clean = word.strip('.,!?;:"\'()[]{}')
287
- if len(clean) > 3 and clean.isalpha():
288
- important_words.append(clean)
289
- if len(clean) > 4: # Add to keywords
290
- target_keywords.add(clean)
291
-
292
- print(f"\nTarget page keywords detected: {list(target_keywords)[:10]}")
293
-
294
- # Now search for phrases in source article
295
  full_text = " ".join(blocks)
296
- sentences = re.split(r'[.!?]', full_text)
297
 
298
- candidate_anchors = {} # phrase -> (sentence, score, needs_bridge)
 
 
 
 
 
299
 
 
 
300
  for sentence in sentences:
301
- if not sentence or len(sentence.strip()) < 20:
302
  continue
303
-
304
- sentence_lower = sentence.lower()
305
  words = sentence.split()
306
 
307
- # Look for ALL potential phrases, even loosely related ones
308
- # Single important words
309
- for word in words:
310
- clean_word = word.strip('.,!?;:"\'()[]{}')
311
- if (len(clean_word) > 4 and clean_word.isalpha()):
312
- # Calculate relevance even for loose matches
313
- try:
314
- word_emb = embed([clean_word])[0]
315
- target_emb = embed([target_context.get("summary", "")[:500]])[0]
316
- semantic_score = F.cosine_similarity(
317
- word_emb.unsqueeze(0),
318
- target_emb.unsqueeze(0)
319
- ).item()
320
-
321
- # Lower threshold for considering candidates
322
- if semantic_score > 0.15: # Much lower threshold
323
- needs_bridge = semantic_score < 0.3 # Mark if needs bridge content
324
- if clean_word not in candidate_anchors or candidate_anchors[clean_word][1] < semantic_score:
325
- candidate_anchors[clean_word] = (sentence.strip(), semantic_score, needs_bridge)
326
- except:
327
- continue
328
-
329
- # Look for 2-4 word phrases
330
  for length in range(2, min(5, len(words) + 1)):
331
  for i in range(len(words) - length + 1):
332
- if i < 0 or i + length > len(words):
333
- continue
334
-
335
- phrase_words = words[i:i+length]
336
- phrase = ' '.join(phrase_words)
337
- phrase_clean = phrase.strip('.,!?;:"\'()')
338
-
339
- # More lenient filtering
340
- skip_words = {'the', 'a', 'an', 'and', 'or', 'but', 'if', 'then', 'than'}
341
-
342
- first_word = phrase_words[0].lower().strip('.,!?;:"\'')
343
- last_word = phrase_words[-1].lower().strip('.,!?;:"\'')
344
-
345
- # Allow more phrases through
346
- if (len(phrase_clean) < 5 or
347
- len(phrase_clean) > 50 or
348
- not phrase_clean[0].isalpha()):
349
  continue
 
 
350
 
351
- # Skip only the worst fragments
352
- if first_word in skip_words and last_word in skip_words:
353
- continue
354
-
355
- # Calculate relevance score
356
- try:
357
- phrase_emb = embed([phrase_clean])[0]
358
- target_emb = embed([target_context.get("summary", "")[:500]])[0]
359
- semantic_score = F.cosine_similarity(
360
- phrase_emb.unsqueeze(0),
361
- target_emb.unsqueeze(0)
362
- ).item()
363
 
364
- # Accept even loosely related phrases
365
- if semantic_score > 0.15: # Much lower threshold
366
- needs_bridge = semantic_score < 0.3 # Mark if needs bridge
367
-
368
- # Check for topic-related words (beauty, skincare, nail, etc.)
369
- bonus = 0
370
- general_beauty_terms = ['beauty', 'skincare', 'cosmetic', 'product', 'treatment',
371
- 'care', 'skin', 'nail', 'makeup', 'store', 'shop',
372
- 'korean', 'k-beauty', 'routine', 'regimen']
373
- for term in general_beauty_terms:
374
- if term in phrase_clean.lower():
375
- bonus = 0.1
376
- break
377
-
378
- total_score = semantic_score + bonus
379
-
380
- if phrase_clean not in candidate_anchors or candidate_anchors[phrase_clean][1] < total_score:
381
- candidate_anchors[phrase_clean] = (sentence.strip(), total_score, needs_bridge)
382
- if total_score > 0.2: # Only print decent candidates
383
- print(f" Candidate: '{phrase_clean}' (score: {total_score:.3f}, needs_bridge: {needs_bridge})")
384
- except:
385
- continue
386
 
387
- # If no candidates at all, try to find ANY noun phrase in the article
388
- if not candidate_anchors:
389
- print("\nNo semantic matches found, looking for any noun phrases...")
390
- for sentence in sentences[:10]: # Check first 10 sentences
391
- words = sentence.split()
392
- for word in words:
393
- clean_word = word.strip('.,!?;:"\'()[]{}')
394
- # Any proper noun or long word
395
- if clean_word and len(clean_word) > 5 and clean_word[0].isupper():
396
- candidate_anchors[clean_word] = (sentence.strip(), 0.1, True) # Low score, needs bridge
397
- break
398
- if candidate_anchors:
399
- break
400
 
401
- # Select the best anchor from candidates
402
- if not candidate_anchors:
403
- print("\n✗ No alternative anchor found at all")
404
- return None, None, False
405
 
406
- # Sort by score and get the best one
407
- sorted_candidates = sorted(candidate_anchors.items(), key=lambda x: x[1][1], reverse=True)
408
- best_anchor, (best_sentence, best_score, needs_bridge) = sorted_candidates[0]
 
 
 
 
 
 
409
 
410
- print(f"\n✓ Best alternative anchor: '{best_anchor}' (relevance: {best_score:.3f})")
411
- if needs_bridge:
412
- print(f" → Will need bridge paragraph to connect to target topic")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
 
414
- return best_anchor, best_sentence, needs_bridge
415
 
416
  except Exception as e:
417
  print(f"Critical error in find_alternative_anchor: {e}")
418
- import traceback
419
- traceback.print_exc()
420
- return None, None, False
421
 
422
  except Exception as e:
423
  print(f"Critical error in find_alternative_anchor: {e}")
@@ -669,8 +642,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
669
  if suggest_alternative and not keyword_present:
670
  try:
671
  # Find a completely different anchor and sentence
672
- # Pass the target_context we already analyzed
673
- alt_anchor, alt_sentence, needs_bridge = find_alternative_anchor(blocks, target_url, anchor_text, target_context)
674
 
675
  if alt_anchor and alt_sentence:
676
  # Create the sentence with the alternative anchor
@@ -679,7 +651,6 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
679
  result["alternative_sentence_original"] = alt_sentence
680
  result["alternative_sentence"] = alt_rewritten
681
  result["alternative_exact_match"] = alt_exact
682
- result["needs_bridge_paragraph"] = needs_bridge
683
  except Exception as e:
684
  print(f"Error finding alternative anchor: {e}")
685
  # Continue without alternative
@@ -798,51 +769,89 @@ def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", languag
798
  # Don't check for exact anchor text match as it might have special chars
799
  return {"sentence_html": out}
800
 
801
def gpt_create_bridge_paragraph(anchor_text, sentence, target_url, target_context, language="English"):
    """Create a bridge paragraph that naturally connects loosely related topics.

    Args:
        anchor_text: Text that is sent to the model as the link anchor.
        sentence: Source-article sentence sent as ``source_context``; it is
            also used as the fallback paragraph.
        target_url: URL sent to the model as the link target.
        target_context: Mapping describing the target page. Only its
            ``"title"`` and ``"meta_description"`` entries are read; ``None``
            is treated as an empty mapping.
        language: Language name interpolated into the system prompt.

    Returns:
        Whatever ``_openai_chat_cached`` returns (the prompt asks for JSON
        with a ``'paragraph'`` key; the shape is not validated here), or
        ``{"paragraph": sentence}`` when ``OPENAI_API_KEY`` is falsy or both
        the preferred and the fallback model calls raise.
    """
    if not OPENAI_API_KEY:
        return {"paragraph": sentence}

    # Tolerate a missing context instead of raising AttributeError outside
    # of any try block.
    target_context = target_context or {}

    # Create cache key
    cache_key = hashlib.md5(f"bridge_{anchor_text}{sentence}{target_url}{language}".encode()).hexdigest()

    target_title = target_context.get("title", "")
    target_topic = target_context.get("meta_description", "")

    system = (
        f"You are a skilled content writer writing in {language}. "
        f"IMPORTANT: Preserve all special characters and diacritics from the {language} language. "
        "Your task is to create a natural bridge paragraph that connects two loosely related topics. "
        "The paragraph should flow naturally from the source topic to the target topic. "
        "RULES: "
        "(1) Start with the context from the source article "
        "(2) Create a natural transition to the target topic "
        "(3) Include the anchor link naturally "
        "(4) Make it 2-3 sentences that feel organic, not forced "
        "(5) Avoid obvious transitions like 'Speaking of...' or 'On a related note...' "
        "Return JSON with key 'paragraph' containing the HTML with the link included."
    )

    user = {
        "task": "create_bridge_paragraph",
        "source_context": sentence,
        "anchor_text": anchor_text,
        "target_url": target_url,
        "target_title": target_title,
        "target_topic": target_topic,
        "language": language,
        "instructions": "Create a smooth, natural paragraph that connects these topics"
    }

    # `except Exception` (not a bare `except:`) so that KeyboardInterrupt and
    # SystemExit still propagate; the failure reason is logged, not dropped.
    try:
        return _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    except Exception as e:
        print(f"[GPT] Bridge paragraph failed with preferred model: {e}")

    # Second attempt with the fallback model under a distinct cache key.
    try:
        return _openai_chat_cached(cache_key + "_fallback", FALLBACK_OPENAI_MODEL, system, user)
    except Exception as e:
        print(f"[GPT] Bridge paragraph failed with fallback model: {e}")
        return {"paragraph": sentence}
846
  """
847
  Final QA pass with language support.
848
  """
 
261
  rewritten = f'{base}{clause}{punct}'
262
  return rewritten, False
263
 
264
+ def find_alternative_anchor(blocks, target_url, original_anchor):
265
  """Find a better anchor text from the article that relates to the target URL."""
266
  try:
267
+ # Get target page context
268
+ try:
269
+ tgt_html = requests.get(target_url, timeout=20, headers=UA).text
270
+ soup = BeautifulSoup(tgt_html, "html.parser")
271
+
272
+ # Extract target page title and meta description
273
+ title = soup.title.get_text().strip() if soup.title else ""
274
+ meta_desc = ""
275
+ meta_tag = soup.find("meta", attrs={"name": "description"})
276
+ if meta_tag:
277
+ meta_desc = meta_tag.get("content", "")
278
+
279
+ # Extract key terms from target page (first few paragraphs)
280
+ target_paragraphs = []
281
+ for p in soup.find_all("p")[:5]:
282
+ text = p.get_text().strip()
283
+ if len(text) > 50:
284
+ target_paragraphs.append(text)
285
+ target_content = " ".join(target_paragraphs[:3])
286
+
287
+ except Exception as e:
288
+ print(f"Error fetching target URL: {e}")
289
+ title = ""
290
+ meta_desc = ""
291
+ target_content = original_anchor
292
+
293
+ # Extract all potential anchor phrases from the source article
294
+ all_phrases = set()
295
  full_text = " ".join(blocks)
 
296
 
297
+ # Common words to exclude
298
+ stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
299
+ 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
300
+ 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
301
+ 'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
302
+ 'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što'}
303
 
304
+ # Extract noun phrases and important terms (2-4 words)
305
+ sentences = re.split(r'[.!?]', full_text)
306
  for sentence in sentences:
307
+ if not sentence:
308
  continue
 
 
309
  words = sentence.split()
310
 
311
+ # Extract phrases of 2-4 words
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  for length in range(2, min(5, len(words) + 1)):
313
  for i in range(len(words) - length + 1):
314
+ if i < 0 or i+length > len(words):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  continue
316
+ phrase = ' '.join(words[i:i+length])
317
+ phrase_clean = phrase.strip('.,!?;:"\'')
318
 
319
+ # Check if phrase is meaningful
320
+ if i < len(words) and i+length-1 < len(words):
321
+ first_word = words[i].lower().strip('.,!?;:')
322
+ last_word = words[i+length-1].lower().strip('.,!?;:')
 
 
 
 
 
 
 
 
323
 
324
+ # Skip if starts/ends with stopwords or is too short
325
+ if (first_word not in stopwords and
326
+ last_word not in stopwords and
327
+ len(phrase_clean) > 5 and
328
+ len(phrase_clean) < 50):
329
+ all_phrases.add(phrase_clean)
330
+
331
+ # Also extract single important words (proper nouns, long words)
332
+ for word in words:
333
+ clean_word = word.strip('.,!?;:"\'')
334
+ if clean_word and (len(clean_word) > 6 or
335
+ (len(clean_word) > 0 and clean_word[0].isupper() and clean_word.lower() not in stopwords)):
336
+ all_phrases.add(clean_word)
 
 
 
 
 
 
 
 
 
337
 
338
+ if not all_phrases:
339
+ return None, None
 
 
 
 
 
 
 
 
 
 
 
340
 
341
+ # Create context query from target URL info
342
+ target_context = f"{title} {meta_desc} {target_content}"[:500]
 
 
343
 
344
+ # Score each phrase based on relevance to target
345
+ try:
346
+ target_emb = embed([target_context])[0]
347
+ except:
348
+ return None, None
349
+
350
+ best_anchor = None
351
+ best_score = -1
352
+ best_sentence = None
353
 
354
+ # Evaluate each potential anchor
355
+ for phrase in list(all_phrases)[:50]: # Limit to first 50 to avoid too much processing
356
+ # Skip if too similar to original anchor (we want something different)
357
+ if phrase.lower() == original_anchor.lower():
358
+ continue
359
+
360
+ try:
361
+ # Score this phrase against target context
362
+ phrase_emb = embed([phrase])[0]
363
+ relevance_score = F.cosine_similarity(phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
364
+
365
+ # Check if this phrase appears in article and find its best context
366
+ if phrase.lower() in full_text.lower():
367
+ # Find sentences containing this phrase
368
+ for block in blocks:
369
+ if phrase.lower() in block.lower():
370
+ sents = re.split(r'(?<=[.!?])\s+', block)
371
+ for sent in sents:
372
+ if sent and phrase.lower() in sent.lower():
373
+ # Score this sentence-phrase combination
374
+ try:
375
+ sent_emb = embed([sent])[0]
376
+ context_score = F.cosine_similarity(sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
377
+ combined_score = (relevance_score * 0.6) + (context_score * 0.4)
378
+
379
+ if combined_score > best_score:
380
+ best_score = combined_score
381
+ best_anchor = phrase
382
+ best_sentence = sent
383
+ except:
384
+ continue
385
+ except Exception as e:
386
+ print(f"Error evaluating phrase '{phrase}': {e}")
387
+ continue
388
 
389
+ return best_anchor, best_sentence
390
 
391
  except Exception as e:
392
  print(f"Critical error in find_alternative_anchor: {e}")
393
+ return None, None
 
 
394
 
395
  except Exception as e:
396
  print(f"Critical error in find_alternative_anchor: {e}")
 
642
  if suggest_alternative and not keyword_present:
643
  try:
644
  # Find a completely different anchor and sentence
645
+ alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)
 
646
 
647
  if alt_anchor and alt_sentence:
648
  # Create the sentence with the alternative anchor
 
651
  result["alternative_sentence_original"] = alt_sentence
652
  result["alternative_sentence"] = alt_rewritten
653
  result["alternative_exact_match"] = alt_exact
 
654
  except Exception as e:
655
  print(f"Error finding alternative anchor: {e}")
656
  # Continue without alternative
 
769
  # Don't check for exact anchor text match as it might have special chars
770
  return {"sentence_html": out}
771
 
772
def _coerce_keyword_list(raw):
    """Return *raw* as a list of non-empty keyword strings."""
    if isinstance(raw, str):
        # A single comma-separated string instead of an array.
        raw = raw.split(",")
    if not isinstance(raw, (list, tuple)):
        return []
    return [item.strip() for item in raw if isinstance(item, str) and item.strip()]


def gpt_get_target_keywords(target_url, target_context, language="English"):
    """Ask GPT to suggest 5-10 relevant search keywords users would use to find this page.

    Args:
        target_url: URL of the target page; sent to the model and used
            (together with ``language``) to build the cache key.
        target_context: Mapping describing the target page. Only ``"title"``,
            ``"meta_description"`` and the first 500 characters of
            ``"main_content"`` are read; ``None`` is treated as empty.
        language: Language name interpolated into the system prompt.

    Returns:
        A list of keyword strings. The list is empty when ``OPENAI_API_KEY``
        is falsy, when the model call raises, or when the response carries no
        usable ``"keywords"`` value.
    """
    if not OPENAI_API_KEY:
        return []

    # Create cache key
    cache_key = hashlib.md5(f"keywords_{target_url}{language}".encode()).hexdigest()

    if cache_key in API_RESPONSE_CACHE:
        print(f"[GPT] Using cached keywords for {target_url[:30]}...")
        cached = API_RESPONSE_CACHE[cache_key]
        # Guard against a cached value that is not a mapping.
        return _coerce_keyword_list(cached.get("keywords", [])) if isinstance(cached, dict) else []

    # A missing context, or a field explicitly set to None, must not raise
    # here: this code runs before the try block below.
    target_context = target_context or {}
    title = target_context.get("title") or ""
    meta = target_context.get("meta_description") or ""
    content = (target_context.get("main_content") or "")[:500]

    system = (
        "You are an SEO expert. Based on the page content provided, suggest 5-10 search keywords or phrases "
        "that users would likely type into Google to find this page. "
        "Include both short keywords (1-2 words) and long-tail keywords (3-5 words). "
        "Make them realistic search terms, not just words from the page. "
        f"Consider the {language} language and local search patterns. "
        "Return JSON with a 'keywords' array."
    )

    user = {
        "url": target_url,
        "title": title,
        "meta_description": meta,
        "content_preview": content,
        "task": "Generate search keywords users would use to find this page"
    }

    try:
        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
        # The model output is untrusted: normalise it so callers always
        # receive a list of strings.
        keywords = _coerce_keyword_list(obj.get("keywords", []))
        print(f"\n[GPT] Target page keywords: {keywords}")
        return keywords
    except Exception as e:
        print(f"[GPT] Error getting keywords: {e}")
        return []
813
+
814
def gpt_add_keyword_to_content(blocks, keywords, target_url, language="English"):
    """Ask GPT to naturally add one of the keywords to the content with proper context.

    Args:
        blocks: Sequence of article text blocks; the first five are joined
            with spaces and sent to the model as ``article_preview``.
        keywords: Candidate keywords sent as ``available_keywords``.
        target_url: URL sent to the model as the link target.
        language: Language name interpolated into the system prompt.

    Returns:
        Whatever ``_openai_chat_cached`` returns (the prompt asks for JSON
        with ``'keyword_used'``, ``'content_to_add'`` and
        ``'placement_instruction'``; the shape is not validated here), or
        ``None`` when ``OPENAI_API_KEY`` is falsy, ``keywords`` or ``blocks``
        is empty, or the model call raises.
    """
    if not OPENAI_API_KEY or not keywords:
        return None
    if not blocks:
        # Nothing to integrate the keyword into.
        return None

    article_preview = " ".join(blocks[:5])

    # Create cache key. It covers exactly what shapes the request: the
    # preview that is sent, the keywords, the URL and the language. Leaving
    # out the language would let a response written in one language be
    # served from the cache for another.
    cache_key = hashlib.md5(
        f"add_kw_{article_preview}{keywords}{target_url}{language}".encode()
    ).hexdigest()

    if cache_key in API_RESPONSE_CACHE:
        return API_RESPONSE_CACHE[cache_key]

    system = (
        f"You are a skilled content editor writing in {language}. "
        "Your task is to naturally integrate ONE of the provided keywords into the article content. "
        "RULES: "
        "1. Choose the keyword that fits most naturally with the existing content "
        "2. Add 2-3 sentences or a short paragraph that includes the keyword "
        "3. Make it flow naturally - it should feel like it belongs there "
        "4. Include an HTML link using the keyword as anchor text "
        "5. Specify WHERE to add it (e.g., 'after the second paragraph', 'before the conclusion') "
        "6. The addition should provide value, not just keyword stuffing "
        f"7. Write in {language} and preserve special characters "
        "Return JSON with: 'keyword_used', 'content_to_add', 'placement_instruction'"
    )

    user = {
        "article_preview": article_preview,
        "available_keywords": keywords,
        "target_url": target_url,
        "language": language,
        "task": "Add one keyword naturally to the content"
    }

    try:
        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
        API_RESPONSE_CACHE[cache_key] = obj
        return obj
    except Exception as e:
        print(f"[GPT] Error adding keyword: {e}")
        return None
 
 
 
855
  """
856
  Final QA pass with language support.
857
  """