dusan-presswhizz committed on
Commit
8b2a25c
·
verified ·
1 Parent(s): 616a3cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -100
app.py CHANGED
@@ -286,52 +286,48 @@ def find_alternative_anchor(blocks, target_url, original_anchor, target_context=
286
  clean = word.strip('.,!?;:"\'()[]{}')
287
  if len(clean) > 3 and clean.isalpha():
288
  important_words.append(clean)
289
-
290
- # Find multi-word phrases in target that might be important
291
- target_text_combined = f"{target_title} {target_meta} {target_headings}".lower()
292
-
293
- # Look for domain-specific terms (cleaning, hotel, service, luxury, etc.)
294
- domain_indicators = ['hotel', 'cleaning', 'service', 'luxury', 'housekeeping',
295
- 'maintenance', 'staff', 'room', 'suite', 'amenities',
296
- 'hospitality', 'facility', 'hygiene', 'sanitation',
297
- 'laundry', 'janitorial', 'professional', 'quality']
298
-
299
- for indicator in domain_indicators:
300
- if indicator in target_text_combined:
301
- target_keywords.add(indicator)
302
 
303
  print(f"\nTarget page keywords detected: {list(target_keywords)[:10]}")
304
 
305
- # Now search for MEANINGFUL phrases in source article that relate to these concepts
306
  full_text = " ".join(blocks)
307
  sentences = re.split(r'[.!?]', full_text)
308
 
309
- candidate_anchors = {} # phrase -> (sentence, score)
310
 
311
  for sentence in sentences:
312
  if not sentence or len(sentence.strip()) < 20:
313
  continue
314
 
315
  sentence_lower = sentence.lower()
316
-
317
- # Look for meaningful phrases (not random fragments)
318
  words = sentence.split()
319
 
320
- # Single important words (must be nouns/adjectives, not fragments)
 
321
  for word in words:
322
  clean_word = word.strip('.,!?;:"\'()[]{}')
323
- if (len(clean_word) > 4 and
324
- clean_word.isalpha() and
325
- clean_word[0].isupper()): # Likely a proper noun
326
-
327
- # Check if this word relates to our target keywords
328
- relevance = sum(1 for kw in target_keywords if kw in clean_word.lower() or clean_word.lower() in kw)
329
- if relevance > 0 or any(kw in clean_word.lower() for kw in ['hotel', 'resort', 'luxury', 'service']):
330
- if clean_word not in candidate_anchors or candidate_anchors[clean_word][1] < relevance:
331
- candidate_anchors[clean_word] = (sentence.strip(), relevance)
 
 
 
 
 
 
 
 
332
 
333
- # Look for 2-4 word MEANINGFUL phrases (not random fragments)
334
- for length in range(2, 5):
335
  for i in range(len(words) - length + 1):
336
  if i < 0 or i + length > len(words):
337
  continue
@@ -340,87 +336,88 @@ def find_alternative_anchor(blocks, target_url, original_anchor, target_context=
340
  phrase = ' '.join(phrase_words)
341
  phrase_clean = phrase.strip('.,!?;:"\'()')
342
 
343
- # Skip if it's a fragment (starts/ends with conjunction, article, preposition)
344
- skip_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
345
- 'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are',
346
- 'were', 'be', 'have', 'has', 'had', 'do', 'does', 'did',
347
- 'will', 'would', 'could', 'should', 'may', 'might', 'must',
348
- 'shall', 'can', 'need', 'ought', 'used', 'if', 'then', 'than'}
349
 
350
  first_word = phrase_words[0].lower().strip('.,!?;:"\'')
351
  last_word = phrase_words[-1].lower().strip('.,!?;:"\'')
352
 
353
- # Must be a complete phrase, not a fragment
354
- if (first_word in skip_words or
355
- last_word in skip_words or
356
- len(phrase_clean) < 8 or
357
  len(phrase_clean) > 50 or
358
- not phrase_clean[0].isalpha() or
359
- phrase_clean.endswith("'s")): # Skip possessives
360
  continue
361
 
362
- # Check if phrase is actually meaningful (contains important words)
363
- phrase_lower = phrase_clean.lower()
364
-
365
- # Score based on relevance to target page
366
- relevance_score = 0
367
-
368
- # Direct keyword matches
369
- for kw in target_keywords:
370
- if kw in phrase_lower:
371
- relevance_score += 2
372
-
373
- # Semantic relevance to hotel/cleaning/service domain
374
- for indicator in ['hotel', 'luxury', 'service', 'room', 'suite', 'clean',
375
- 'staff', 'guest', 'resort', 'boutique', 'accommodation']:
376
- if indicator in phrase_lower:
377
- relevance_score += 1
378
 
379
- # Only consider if it has some relevance
380
- if relevance_score > 0:
381
- # Make sure it's a coherent phrase by checking with embeddings
382
- try:
383
- phrase_emb = embed([phrase_clean])[0]
384
- target_emb = embed([target_context.get("summary", "")[:500]])[0]
385
- semantic_score = F.cosine_similarity(
386
- phrase_emb.unsqueeze(0),
387
- target_emb.unsqueeze(0)
388
- ).item()
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
- # Combined score
391
- total_score = (relevance_score * 0.4) + (semantic_score * 0.6)
392
 
393
- # Only keep if it's good enough and better than existing
394
- if (semantic_score > 0.3 and total_score > 0.35 and
395
- (phrase_clean not in candidate_anchors or
396
- candidate_anchors[phrase_clean][1] < total_score)):
397
- candidate_anchors[phrase_clean] = (sentence.strip(), total_score)
398
- print(f" Candidate: '{phrase_clean}' (score: {total_score:.3f})")
399
- except:
400
- continue
 
 
 
 
 
 
 
 
 
 
 
 
401
 
402
  # Select the best anchor from candidates
403
  if not candidate_anchors:
404
- print("\n✗ No suitable alternative anchor found")
405
- return None, None
406
 
407
  # Sort by score and get the best one
408
  sorted_candidates = sorted(candidate_anchors.items(), key=lambda x: x[1][1], reverse=True)
409
- best_anchor, (best_sentence, best_score) = sorted_candidates[0]
410
-
411
- # Final validation - make sure it's actually good
412
- if best_score < 0.35 or len(best_anchor) < 5:
413
- print(f"\n✗ Best candidate '{best_anchor}' not good enough (score: {best_score:.3f})")
414
- return None, None
415
 
416
  print(f"\n✓ Best alternative anchor: '{best_anchor}' (relevance: {best_score:.3f})")
417
- return best_anchor, best_sentence
 
 
 
418
 
419
  except Exception as e:
420
  print(f"Critical error in find_alternative_anchor: {e}")
421
  import traceback
422
  traceback.print_exc()
423
- return None, None
424
 
425
  except Exception as e:
426
  print(f"Critical error in find_alternative_anchor: {e}")
@@ -673,7 +670,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
673
  try:
674
  # Find a completely different anchor and sentence
675
  # Pass the target_context we already analyzed
676
- alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text, target_context)
677
 
678
  if alt_anchor and alt_sentence:
679
  # Create the sentence with the alternative anchor
@@ -682,6 +679,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
682
  result["alternative_sentence_original"] = alt_sentence
683
  result["alternative_sentence"] = alt_rewritten
684
  result["alternative_exact_match"] = alt_exact
 
685
  except Exception as e:
686
  print(f"Error finding alternative anchor: {e}")
687
  # Continue without alternative
@@ -800,7 +798,51 @@ def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", languag
800
  # Don't check for exact anchor text match as it might have special chars
801
  return {"sentence_html": out}
802
 
803
- def gpt_validate_and_polish(sentence_html, anchor_text, target_url, language="English"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
804
  """
805
  Final QA pass with language support.
806
  """
@@ -957,30 +999,51 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
957
  alt_anchor = res["alternative_anchor"]
958
  alt_sentence_original = res.get("alternative_sentence_original", "")
959
  alt_sentence = res.get("alternative_sentence", "")
 
960
 
961
  # Detect language for alternative sentence
962
  if alt_sentence_original:
963
  alt_detected_lang = detect_language(alt_sentence_original)
964
  alt_language_name = get_language_name(alt_detected_lang)
965
 
966
- # Apply GPT rewriting to alternative as well
967
- if smart_rewrite and alt_sentence:
968
- alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=alt_language_name)
969
- alt_final = alt_g["sentence_html"]
 
 
 
 
 
 
 
 
 
 
 
970
  else:
971
- alt_final = alt_sentence
972
-
973
- # Polish if needed
974
- if not res.get("alternative_exact_match", False):
975
- alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=alt_language_name)
976
- alt_final = alt_polished.get("sentence_html", alt_final)
 
 
 
 
 
977
 
978
  alt_output = to_plain_text(alt_final) if plain_text else alt_final
979
 
980
  # Add alternative as Result 2
981
  result += f"\n\n{'='*50}\n\n"
982
  result += f"🔗 Result 2 - Alternative from article:\n"
983
- result += f"💡 Alternative anchor: '{alt_anchor}'\n\n"
 
 
 
 
984
  result += f"Original: {alt_sentence_original}\n\n"
985
  result += f"Suggested: {alt_output}"
986
 
 
286
  clean = word.strip('.,!?;:"\'()[]{}')
287
  if len(clean) > 3 and clean.isalpha():
288
  important_words.append(clean)
289
+ if len(clean) > 4: # Add to keywords
290
+ target_keywords.add(clean)
 
 
 
 
 
 
 
 
 
 
 
291
 
292
  print(f"\nTarget page keywords detected: {list(target_keywords)[:10]}")
293
 
294
+ # Now search for phrases in source article
295
  full_text = " ".join(blocks)
296
  sentences = re.split(r'[.!?]', full_text)
297
 
298
+ candidate_anchors = {} # phrase -> (sentence, score, needs_bridge)
299
 
300
  for sentence in sentences:
301
  if not sentence or len(sentence.strip()) < 20:
302
  continue
303
 
304
  sentence_lower = sentence.lower()
 
 
305
  words = sentence.split()
306
 
307
+ # Look for ALL potential phrases, even loosely related ones
308
+ # Single important words
309
  for word in words:
310
  clean_word = word.strip('.,!?;:"\'()[]{}')
311
+ if (len(clean_word) > 4 and clean_word.isalpha()):
312
+ # Calculate relevance even for loose matches
313
+ try:
314
+ word_emb = embed([clean_word])[0]
315
+ target_emb = embed([target_context.get("summary", "")[:500]])[0]
316
+ semantic_score = F.cosine_similarity(
317
+ word_emb.unsqueeze(0),
318
+ target_emb.unsqueeze(0)
319
+ ).item()
320
+
321
+ # Lower threshold for considering candidates
322
+ if semantic_score > 0.15: # Much lower threshold
323
+ needs_bridge = semantic_score < 0.3 # Mark if needs bridge content
324
+ if clean_word not in candidate_anchors or candidate_anchors[clean_word][1] < semantic_score:
325
+ candidate_anchors[clean_word] = (sentence.strip(), semantic_score, needs_bridge)
326
+ except:
327
+ continue
328
 
329
+ # Look for 2-4 word phrases
330
+ for length in range(2, min(5, len(words) + 1)):
331
  for i in range(len(words) - length + 1):
332
  if i < 0 or i + length > len(words):
333
  continue
 
336
  phrase = ' '.join(phrase_words)
337
  phrase_clean = phrase.strip('.,!?;:"\'()')
338
 
339
+ # More lenient filtering
340
+ skip_words = {'the', 'a', 'an', 'and', 'or', 'but', 'if', 'then', 'than'}
 
 
 
 
341
 
342
  first_word = phrase_words[0].lower().strip('.,!?;:"\'')
343
  last_word = phrase_words[-1].lower().strip('.,!?;:"\'')
344
 
345
+ # Allow more phrases through
346
+ if (len(phrase_clean) < 5 or
 
 
347
  len(phrase_clean) > 50 or
348
+ not phrase_clean[0].isalpha()):
 
349
  continue
350
 
351
+ # Skip only the worst fragments
352
+ if first_word in skip_words and last_word in skip_words:
353
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
354
 
355
+ # Calculate relevance score
356
+ try:
357
+ phrase_emb = embed([phrase_clean])[0]
358
+ target_emb = embed([target_context.get("summary", "")[:500]])[0]
359
+ semantic_score = F.cosine_similarity(
360
+ phrase_emb.unsqueeze(0),
361
+ target_emb.unsqueeze(0)
362
+ ).item()
363
+
364
+ # Accept even loosely related phrases
365
+ if semantic_score > 0.15: # Much lower threshold
366
+ needs_bridge = semantic_score < 0.3 # Mark if needs bridge
367
+
368
+ # Check for topic-related words (beauty, skincare, nail, etc.)
369
+ bonus = 0
370
+ general_beauty_terms = ['beauty', 'skincare', 'cosmetic', 'product', 'treatment',
371
+ 'care', 'skin', 'nail', 'makeup', 'store', 'shop',
372
+ 'korean', 'k-beauty', 'routine', 'regimen']
373
+ for term in general_beauty_terms:
374
+ if term in phrase_clean.lower():
375
+ bonus = 0.1
376
+ break
377
 
378
+ total_score = semantic_score + bonus
 
379
 
380
+ if phrase_clean not in candidate_anchors or candidate_anchors[phrase_clean][1] < total_score:
381
+ candidate_anchors[phrase_clean] = (sentence.strip(), total_score, needs_bridge)
382
+ if total_score > 0.2: # Only print decent candidates
383
+ print(f" Candidate: '{phrase_clean}' (score: {total_score:.3f}, needs_bridge: {needs_bridge})")
384
+ except:
385
+ continue
386
+
387
+ # If no candidates at all, try to find ANY noun phrase in the article
388
+ if not candidate_anchors:
389
+ print("\nNo semantic matches found, looking for any noun phrases...")
390
+ for sentence in sentences[:10]: # Check first 10 sentences
391
+ words = sentence.split()
392
+ for word in words:
393
+ clean_word = word.strip('.,!?;:"\'()[]{}')
394
+ # Any proper noun or long word
395
+ if clean_word and len(clean_word) > 5 and clean_word[0].isupper():
396
+ candidate_anchors[clean_word] = (sentence.strip(), 0.1, True) # Low score, needs bridge
397
+ break
398
+ if candidate_anchors:
399
+ break
400
 
401
  # Select the best anchor from candidates
402
  if not candidate_anchors:
403
+ print("\n✗ No alternative anchor found at all")
404
+ return None, None, False
405
 
406
  # Sort by score and get the best one
407
  sorted_candidates = sorted(candidate_anchors.items(), key=lambda x: x[1][1], reverse=True)
408
+ best_anchor, (best_sentence, best_score, needs_bridge) = sorted_candidates[0]
 
 
 
 
 
409
 
410
  print(f"\n✓ Best alternative anchor: '{best_anchor}' (relevance: {best_score:.3f})")
411
+ if needs_bridge:
412
+ print(f" → Will need bridge paragraph to connect to target topic")
413
+
414
+ return best_anchor, best_sentence, needs_bridge
415
 
416
  except Exception as e:
417
  print(f"Critical error in find_alternative_anchor: {e}")
418
  import traceback
419
  traceback.print_exc()
420
+ return None, None, False
421
 
422
  except Exception as e:
423
  print(f"Critical error in find_alternative_anchor: {e}")
 
670
  try:
671
  # Find a completely different anchor and sentence
672
  # Pass the target_context we already analyzed
673
+ alt_anchor, alt_sentence, needs_bridge = find_alternative_anchor(blocks, target_url, anchor_text, target_context)
674
 
675
  if alt_anchor and alt_sentence:
676
  # Create the sentence with the alternative anchor
 
679
  result["alternative_sentence_original"] = alt_sentence
680
  result["alternative_sentence"] = alt_rewritten
681
  result["alternative_exact_match"] = alt_exact
682
+ result["needs_bridge_paragraph"] = needs_bridge
683
  except Exception as e:
684
  print(f"Error finding alternative anchor: {e}")
685
  # Continue without alternative
 
798
  # Don't check for exact anchor text match as it might have special chars
799
  return {"sentence_html": out}
800
 
801
def gpt_create_bridge_paragraph(anchor_text, sentence, target_url, target_context, language="English"):
    """Create a bridge paragraph that naturally connects loosely related topics.

    Asks the preferred OpenAI model, and on failure the fallback model, for a
    short paragraph that leads from the source sentence to the target page
    and contains the anchor link.

    Args:
        anchor_text: Anchor text the generated paragraph should link with.
        sentence: Source-article sentence used as the starting context. It is
            also what is returned when no paragraph can be generated.
        target_url: URL the anchor should point to.
        target_context: Mapping read for the optional keys "title" and
            "meta_description". ``None`` is treated as an empty mapping.
        language: Name of the language the paragraph should be written in.

    Returns:
        A dict. On success this is whatever ``_openai_chat_cached`` returned
        (the prompt asks for a "paragraph" key holding HTML, but that is not
        validated here). When no API key is configured, or both model
        attempts fail, ``{"paragraph": sentence}`` is returned instead.
    """
    if not OPENAI_API_KEY:
        return {"paragraph": sentence}

    # Tolerate a missing context instead of raising AttributeError on .get().
    target_context = target_context or {}

    # Create cache key
    cache_key = hashlib.md5(f"bridge_{anchor_text}{sentence}{target_url}{language}".encode()).hexdigest()

    target_title = target_context.get("title", "")
    target_topic = target_context.get("meta_description", "")

    system = (
        f"You are a skilled content writer writing in {language}. "
        f"IMPORTANT: Preserve all special characters and diacritics from the {language} language. "
        "Your task is to create a natural bridge paragraph that connects two loosely related topics. "
        "The paragraph should flow naturally from the source topic to the target topic. "
        "RULES: "
        "(1) Start with the context from the source article "
        "(2) Create a natural transition to the target topic "
        "(3) Include the anchor link naturally "
        "(4) Make it 2-3 sentences that feel organic, not forced "
        "(5) Avoid obvious transitions like 'Speaking of...' or 'On a related note...' "
        "Return JSON with key 'paragraph' containing the HTML with the link included."
    )

    user = {
        "task": "create_bridge_paragraph",
        "source_context": sentence,
        "anchor_text": anchor_text,
        "target_url": target_url,
        "target_title": target_title,
        "target_topic": target_topic,
        "language": language,
        "instructions": "Create a smooth, natural paragraph that connects these topics"
    }

    # Preferred model first, then the fallback under its own cache key so a
    # cached failure/result of one model never shadows the other.
    attempts = (
        (cache_key, PREFERRED_OPENAI_MODEL),
        (cache_key + "_fallback", FALLBACK_OPENAI_MODEL),
    )
    for attempt_key, model in attempts:
        try:
            obj = _openai_chat_cached(attempt_key, model, system, user)
        except Exception as e:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate; report the failure instead of
            # hiding it, then move on to the next model.
            print(f"Error creating bridge paragraph with {model}: {e}")
            continue
        if isinstance(obj, dict):
            return obj
        # Callers call .get() on the result, so a non-dict reply is unusable.
        print(f"Unexpected bridge paragraph response from {model}: {type(obj).__name__}")

    # Best-effort fallback: hand back the untouched source sentence.
    return {"paragraph": sentence}
846
  """
847
  Final QA pass with language support.
848
  """
 
999
  alt_anchor = res["alternative_anchor"]
1000
  alt_sentence_original = res.get("alternative_sentence_original", "")
1001
  alt_sentence = res.get("alternative_sentence", "")
1002
+ needs_bridge = res.get("needs_bridge_paragraph", False)
1003
 
1004
  # Detect language for alternative sentence
1005
  if alt_sentence_original:
1006
  alt_detected_lang = detect_language(alt_sentence_original)
1007
  alt_language_name = get_language_name(alt_detected_lang)
1008
 
1009
+ # If needs bridge paragraph, create one
1010
+ if needs_bridge and smart_rewrite:
1011
+ # Get target context for bridge creation
1012
+ target_info = {
1013
+ "title": res.get("target_title", ""),
1014
+ "meta_description": res.get("target_topic", "")
1015
+ }
1016
+ bridge_result = gpt_create_bridge_paragraph(
1017
+ alt_anchor,
1018
+ alt_sentence_original,
1019
+ target_url,
1020
+ target_info,
1021
+ alt_language_name
1022
+ )
1023
+ alt_final = bridge_result.get("paragraph", alt_sentence)
1024
  else:
1025
+ # Apply normal GPT rewriting
1026
+ if smart_rewrite and alt_sentence:
1027
+ alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=alt_language_name)
1028
+ alt_final = alt_g["sentence_html"]
1029
+ else:
1030
+ alt_final = alt_sentence
1031
+
1032
+ # Polish if needed
1033
+ if not res.get("alternative_exact_match", False) and smart_rewrite:
1034
+ alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=alt_language_name)
1035
+ alt_final = alt_polished.get("sentence_html", alt_final)
1036
 
1037
  alt_output = to_plain_text(alt_final) if plain_text else alt_final
1038
 
1039
  # Add alternative as Result 2
1040
  result += f"\n\n{'='*50}\n\n"
1041
  result += f"🔗 Result 2 - Alternative from article:\n"
1042
+ result += f"💡 Alternative anchor: '{alt_anchor}'\n"
1043
+ if needs_bridge:
1044
+ result += f"🌉 Bridge paragraph created (topics were loosely related)\n\n"
1045
+ else:
1046
+ result += f"\n"
1047
  result += f"Original: {alt_sentence_original}\n\n"
1048
  result += f"Suggested: {alt_output}"
1049