Spaces:

dusan-presswhizz
/

PressWhizz-Link-Insert-Suggestion-API

Build error

App Files Files Community

dusan-presswhizz committed on Aug 24, 2025

Commit

8b2a25c

verified ·

1 Parent(s): 616a3cc

Update app.py

Browse files

Files changed (1) hide show

app.py +163 -100

app.py CHANGED Viewed

@@ -286,52 +286,48 @@ def find_alternative_anchor(blocks, target_url, original_anchor, target_context=
                 clean = word.strip('.,!?;:"\'()[]{}')
                 if len(clean) > 3 and clean.isalpha():
                     important_words.append(clean)
-        # Find multi-word phrases in target that might be important
-        target_text_combined = f"{target_title} {target_meta} {target_headings}".lower()
-        # Look for domain-specific terms (cleaning, hotel, service, luxury, etc.)
-        domain_indicators = ['hotel', 'cleaning', 'service', 'luxury', 'housekeeping',
-                            'maintenance', 'staff', 'room', 'suite', 'amenities',
-                            'hospitality', 'facility', 'hygiene', 'sanitation',
-                            'laundry', 'janitorial', 'professional', 'quality']
-        for indicator in domain_indicators:
-            if indicator in target_text_combined:
-                target_keywords.add(indicator)
         print(f"\nTarget page keywords detected: {list(target_keywords)[:10]}")
-        # Now search for MEANINGFUL phrases in source article that relate to these concepts
         full_text = " ".join(blocks)
         sentences = re.split(r'[.!?]', full_text)
-        candidate_anchors = {}  # phrase -> (sentence, score)
         for sentence in sentences:
             if not sentence or len(sentence.strip()) < 20:
                 continue
             sentence_lower = sentence.lower()
-            # Look for meaningful phrases (not random fragments)
             words = sentence.split()
-            # Single important words (must be nouns/adjectives, not fragments)
             for word in words:
                 clean_word = word.strip('.,!?;:"\'()[]{}')
-                if (len(clean_word) > 4 and
-                    clean_word.isalpha() and
-                    clean_word[0].isupper()):  # Likely a proper noun
-                    # Check if this word relates to our target keywords
-                    relevance = sum(1 for kw in target_keywords if kw in clean_word.lower() or clean_word.lower() in kw)
-                    if relevance > 0 or any(kw in clean_word.lower() for kw in ['hotel', 'resort', 'luxury', 'service']):
-                        if clean_word not in candidate_anchors or candidate_anchors[clean_word][1] < relevance:
-                            candidate_anchors[clean_word] = (sentence.strip(), relevance)
-            # Look for 2-4 word MEANINGFUL phrases (not random fragments)
-            for length in range(2, 5):
                 for i in range(len(words) - length + 1):
                     if i < 0 or i + length > len(words):
                         continue
@@ -340,87 +336,88 @@ def find_alternative_anchor(blocks, target_url, original_anchor, target_context=
                     phrase = ' '.join(phrase_words)
                     phrase_clean = phrase.strip('.,!?;:"\'()')
-                    # Skip if it's a fragment (starts/ends with conjunction, article, preposition)
-                    skip_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
-                                 'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are',
-                                 'were', 'be', 'have', 'has', 'had', 'do', 'does', 'did',
-                                 'will', 'would', 'could', 'should', 'may', 'might', 'must',
-                                 'shall', 'can', 'need', 'ought', 'used', 'if', 'then', 'than'}
                     first_word = phrase_words[0].lower().strip('.,!?;:"\'')
                     last_word = phrase_words[-1].lower().strip('.,!?;:"\'')
-                    # Must be a complete phrase, not a fragment
-                    if (first_word in skip_words or
-                        last_word in skip_words or
-                        len(phrase_clean) < 8 or
                         len(phrase_clean) > 50 or
-                        not phrase_clean[0].isalpha() or
-                        phrase_clean.endswith("'s")):  # Skip possessives
                         continue
-                    # Check if phrase is actually meaningful (contains important words)
-                    phrase_lower = phrase_clean.lower()
-                    # Score based on relevance to target page
-                    relevance_score = 0
-                    # Direct keyword matches
-                    for kw in target_keywords:
-                        if kw in phrase_lower:
-                            relevance_score += 2
-                    # Semantic relevance to hotel/cleaning/service domain
-                    for indicator in ['hotel', 'luxury', 'service', 'room', 'suite', 'clean',
-                                     'staff', 'guest', 'resort', 'boutique', 'accommodation']:
-                        if indicator in phrase_lower:
-                            relevance_score += 1
-                    # Only consider if it has some relevance
-                    if relevance_score > 0:
-                        # Make sure it's a coherent phrase by checking with embeddings
-                        try:
-                            phrase_emb = embed([phrase_clean])[0]
-                            target_emb = embed([target_context.get("summary", "")[:500]])[0]
-                            semantic_score = F.cosine_similarity(
-                                phrase_emb.unsqueeze(0),
-                                target_emb.unsqueeze(0)
-                            ).item()
-                            # Combined score
-                            total_score = (relevance_score * 0.4) + (semantic_score * 0.6)
-                            # Only keep if it's good enough and better than existing
-                            if (semantic_score > 0.3 and total_score > 0.35 and
-                                (phrase_clean not in candidate_anchors or
-                                 candidate_anchors[phrase_clean][1] < total_score)):
-                                candidate_anchors[phrase_clean] = (sentence.strip(), total_score)
-                                print(f"  Candidate: '{phrase_clean}' (score: {total_score:.3f})")
-                        except:
-                            continue
         # Select the best anchor from candidates
         if not candidate_anchors:
-            print("\n✗ No suitable alternative anchor found")
-            return None, None
         # Sort by score and get the best one
         sorted_candidates = sorted(candidate_anchors.items(), key=lambda x: x[1][1], reverse=True)
-        best_anchor, (best_sentence, best_score) = sorted_candidates[0]
-        # Final validation - make sure it's actually good
-        if best_score < 0.35 or len(best_anchor) < 5:
-            print(f"\n✗ Best candidate '{best_anchor}' not good enough (score: {best_score:.3f})")
-            return None, None
         print(f"\n✓ Best alternative anchor: '{best_anchor}' (relevance: {best_score:.3f})")
-        return best_anchor, best_sentence
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
         import traceback
         traceback.print_exc()
-        return None, None
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
@@ -673,7 +670,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
                     try:
                         # Find a completely different anchor and sentence
                         # Pass the target_context we already analyzed
-                        alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text, target_context)
                         if alt_anchor and alt_sentence:
                             # Create the sentence with the alternative anchor
@@ -682,6 +679,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
                             result["alternative_sentence_original"] = alt_sentence
                             result["alternative_sentence"] = alt_rewritten
                             result["alternative_exact_match"] = alt_exact
                     except Exception as e:
                         print(f"Error finding alternative anchor: {e}")
                         # Continue without alternative
@@ -800,7 +798,51 @@ def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", languag
     # Don't check for exact anchor text match as it might have special chars
     return {"sentence_html": out}
-def gpt_validate_and_polish(sentence_html, anchor_text, target_url, language="English"):
     """
     Final QA pass with language support.
     """
@@ -957,30 +999,51 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
             alt_anchor = res["alternative_anchor"]
             alt_sentence_original = res.get("alternative_sentence_original", "")
             alt_sentence = res.get("alternative_sentence", "")
             # Detect language for alternative sentence
             if alt_sentence_original:
                 alt_detected_lang = detect_language(alt_sentence_original)
                 alt_language_name = get_language_name(alt_detected_lang)
-                # Apply GPT rewriting to alternative as well
-                if smart_rewrite and alt_sentence:
-                    alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=alt_language_name)
-                    alt_final = alt_g["sentence_html"]
                 else:
-                    alt_final = alt_sentence
-                # Polish if needed
-                if not res.get("alternative_exact_match", False):
-                    alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=alt_language_name)
-                    alt_final = alt_polished.get("sentence_html", alt_final)
                 alt_output = to_plain_text(alt_final) if plain_text else alt_final
                 # Add alternative as Result 2
                 result += f"\n\n{'='*50}\n\n"
                 result += f"🔗 Result 2 - Alternative from article:\n"
-                result += f"💡 Alternative anchor: '{alt_anchor}'\n\n"
                 result += f"Original: {alt_sentence_original}\n\n"
                 result += f"Suggested: {alt_output}"

                 clean = word.strip('.,!?;:"\'()[]{}')
                 if len(clean) > 3 and clean.isalpha():
                     important_words.append(clean)
+                    if len(clean) > 4:  # Add to keywords
+                        target_keywords.add(clean)
         print(f"\nTarget page keywords detected: {list(target_keywords)[:10]}")
+        # Now search for phrases in source article
         full_text = " ".join(blocks)
         sentences = re.split(r'[.!?]', full_text)
+        candidate_anchors = {}  # phrase -> (sentence, score, needs_bridge)
         for sentence in sentences:
             if not sentence or len(sentence.strip()) < 20:
                 continue
             sentence_lower = sentence.lower()
             words = sentence.split()
+            # Look for ALL potential phrases, even loosely related ones
+            # Single important words
             for word in words:
                 clean_word = word.strip('.,!?;:"\'()[]{}')
+                if (len(clean_word) > 4 and clean_word.isalpha()):
+                    # Calculate relevance even for loose matches
+                    try:
+                        word_emb = embed([clean_word])[0]
+                        target_emb = embed([target_context.get("summary", "")[:500]])[0]
+                        semantic_score = F.cosine_similarity(
+                            word_emb.unsqueeze(0),
+                            target_emb.unsqueeze(0)
+                        ).item()
+                        # Lower threshold for considering candidates
+                        if semantic_score > 0.15:  # Much lower threshold
+                            needs_bridge = semantic_score < 0.3  # Mark if needs bridge content
+                            if clean_word not in candidate_anchors or candidate_anchors[clean_word][1] < semantic_score:
+                                candidate_anchors[clean_word] = (sentence.strip(), semantic_score, needs_bridge)
+                    except:
+                        continue
+            # Look for 2-4 word phrases
+            for length in range(2, min(5, len(words) + 1)):
                 for i in range(len(words) - length + 1):
                     if i < 0 or i + length > len(words):
                         continue
                     phrase = ' '.join(phrase_words)
                     phrase_clean = phrase.strip('.,!?;:"\'()')
+                    # More lenient filtering
+                    skip_words = {'the', 'a', 'an', 'and', 'or', 'but', 'if', 'then', 'than'}
                     first_word = phrase_words[0].lower().strip('.,!?;:"\'')
                     last_word = phrase_words[-1].lower().strip('.,!?;:"\'')
+                    # Allow more phrases through
+                    if (len(phrase_clean) < 5 or
                         len(phrase_clean) > 50 or
+                        not phrase_clean[0].isalpha()):
                         continue
+                    # Skip only the worst fragments
+                    if first_word in skip_words and last_word in skip_words:
+                        continue
+                    # Calculate relevance score
+                    try:
+                        phrase_emb = embed([phrase_clean])[0]
+                        target_emb = embed([target_context.get("summary", "")[:500]])[0]
+                        semantic_score = F.cosine_similarity(
+                            phrase_emb.unsqueeze(0),
+                            target_emb.unsqueeze(0)
+                        ).item()
+                        # Accept even loosely related phrases
+                        if semantic_score > 0.15:  # Much lower threshold
+                            needs_bridge = semantic_score < 0.3  # Mark if needs bridge
+                            # Check for topic-related words (beauty, skincare, nail, etc.)
+                            bonus = 0
+                            general_beauty_terms = ['beauty', 'skincare', 'cosmetic', 'product', 'treatment',
+                                                  'care', 'skin', 'nail', 'makeup', 'store', 'shop',
+                                                  'korean', 'k-beauty', 'routine', 'regimen']
+                            for term in general_beauty_terms:
+                                if term in phrase_clean.lower():
+                                    bonus = 0.1
+                                    break
+                            total_score = semantic_score + bonus
+                            if phrase_clean not in candidate_anchors or candidate_anchors[phrase_clean][1] < total_score:
+                                candidate_anchors[phrase_clean] = (sentence.strip(), total_score, needs_bridge)
+                                if total_score > 0.2:  # Only print decent candidates
+                                    print(f"  Candidate: '{phrase_clean}' (score: {total_score:.3f}, needs_bridge: {needs_bridge})")
+                    except:
+                        continue
+        # If no candidates at all, try to find ANY noun phrase in the article
+        if not candidate_anchors:
+            print("\nNo semantic matches found, looking for any noun phrases...")
+            for sentence in sentences[:10]:  # Check first 10 sentences
+                words = sentence.split()
+                for word in words:
+                    clean_word = word.strip('.,!?;:"\'()[]{}')
+                    # Any proper noun or long word
+                    if clean_word and len(clean_word) > 5 and clean_word[0].isupper():
+                        candidate_anchors[clean_word] = (sentence.strip(), 0.1, True)  # Low score, needs bridge
+                        break
+                if candidate_anchors:
+                    break
         # Select the best anchor from candidates
         if not candidate_anchors:
+            print("\n✗ No alternative anchor found at all")
+            return None, None, False
         # Sort by score and get the best one
         sorted_candidates = sorted(candidate_anchors.items(), key=lambda x: x[1][1], reverse=True)
+        best_anchor, (best_sentence, best_score, needs_bridge) = sorted_candidates[0]
         print(f"\n✓ Best alternative anchor: '{best_anchor}' (relevance: {best_score:.3f})")
+        if needs_bridge:
+            print(f"  → Will need bridge paragraph to connect to target topic")
+        return best_anchor, best_sentence, needs_bridge
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
         import traceback
         traceback.print_exc()
+        return None, None, False
     except Exception as e:
         print(f"Critical error in find_alternative_anchor: {e}")
                     try:
                         # Find a completely different anchor and sentence
                         # Pass the target_context we already analyzed
+                        alt_anchor, alt_sentence, needs_bridge = find_alternative_anchor(blocks, target_url, anchor_text, target_context)
                         if alt_anchor and alt_sentence:
                             # Create the sentence with the alternative anchor
                             result["alternative_sentence_original"] = alt_sentence
                             result["alternative_sentence"] = alt_rewritten
                             result["alternative_exact_match"] = alt_exact
+                            result["needs_bridge_paragraph"] = needs_bridge
                     except Exception as e:
                         print(f"Error finding alternative anchor: {e}")
                         # Continue without alternative
     # Don't check for exact anchor text match as it might have special chars
     return {"sentence_html": out}
def gpt_create_bridge_paragraph(anchor_text, sentence, target_url, target_context, language="English"):
    """Create a bridge paragraph that naturally connects loosely related topics.

    Args:
        anchor_text: Anchor text the generated paragraph should link with.
        sentence: Source-article sentence used as the starting context. It is
            also the fallback output when no paragraph can be generated.
        target_url: URL the anchor should point to.
        target_context: Mapping describing the target page; only the "title"
            and "meta_description" keys are read. ``None`` is treated as empty.
        language: Name of the language the paragraph should be written in.

    Returns:
        A dict with a "paragraph" key. When no API key is configured, when
        both model calls fail, or when neither response contains a non-empty
        "paragraph", this is ``{"paragraph": sentence}``.
    """
    fallback = {"paragraph": sentence}
    if not OPENAI_API_KEY:
        return fallback

    # Tolerate a missing context instead of raising AttributeError on .get().
    target_context = target_context or {}
    target_title = target_context.get("title", "")
    target_topic = target_context.get("meta_description", "")

    # Every value that shapes the prompt is part of the cache key, so a call
    # with a different target title/topic cannot be served a stale paragraph.
    # The separator keeps adjacent fields from running together and colliding.
    key_material = "\x1f".join(
        str(part)
        for part in (
            "bridge", anchor_text, sentence, target_url,
            language, target_title, target_topic,
        )
    )
    cache_key = hashlib.md5(key_material.encode()).hexdigest()

    system = (
        f"You are a skilled content writer writing in {language}. "
        f"IMPORTANT: Preserve all special characters and diacritics from the {language} language. "
        "Your task is to create a natural bridge paragraph that connects two loosely related topics. "
        "The paragraph should flow naturally from the source topic to the target topic. "
        "RULES: "
        "(1) Start with the context from the source article "
        "(2) Create a natural transition to the target topic "
        "(3) Include the anchor link naturally "
        "(4) Make it 2-3 sentences that feel organic, not forced "
        "(5) Avoid obvious transitions like 'Speaking of...' or 'On a related note...' "
        "Return JSON with key 'paragraph' containing the HTML with the link included."
    )
    user = {
        "task": "create_bridge_paragraph",
        "source_context": sentence,
        "anchor_text": anchor_text,
        "target_url": target_url,
        "target_title": target_title,
        "target_topic": target_topic,
        "language": language,
        "instructions": "Create a smooth, natural paragraph that connects these topics"
    }

    # Try the preferred model first, then the fallback model under its own
    # cache key so the two models' results are never mixed up.
    attempts = (
        (PREFERRED_OPENAI_MODEL, cache_key),
        (FALLBACK_OPENAI_MODEL, cache_key + "_fallback"),
    )
    for model, key in attempts:
        try:
            obj = _openai_chat_cached(key, model, system, user)
        except Exception as e:
            # Best-effort: report the failure and move on to the next model.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            print(f"Bridge paragraph generation failed with {model}: {e}")
            continue
        # Only accept a response callers can actually use.
        if isinstance(obj, dict) and obj.get("paragraph"):
            return obj

    return fallback
     """
     Final QA pass with language support.
     """
             alt_anchor = res["alternative_anchor"]
             alt_sentence_original = res.get("alternative_sentence_original", "")
             alt_sentence = res.get("alternative_sentence", "")
+            needs_bridge = res.get("needs_bridge_paragraph", False)
             # Detect language for alternative sentence
             if alt_sentence_original:
                 alt_detected_lang = detect_language(alt_sentence_original)
                 alt_language_name = get_language_name(alt_detected_lang)
+                # If needs bridge paragraph, create one
+                if needs_bridge and smart_rewrite:
+                    # Get target context for bridge creation
+                    target_info = {
+                        "title": res.get("target_title", ""),
+                        "meta_description": res.get("target_topic", "")
+                    }
+                    bridge_result = gpt_create_bridge_paragraph(
+                        alt_anchor,
+                        alt_sentence_original,
+                        target_url,
+                        target_info,
+                        alt_language_name
+                    )
+                    alt_final = bridge_result.get("paragraph", alt_sentence)
                 else:
+                    # Apply normal GPT rewriting
+                    if smart_rewrite and alt_sentence:
+                        alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=alt_language_name)
+                        alt_final = alt_g["sentence_html"]
+                    else:
+                        alt_final = alt_sentence
+                    # Polish if needed
+                    if not res.get("alternative_exact_match", False) and smart_rewrite:
+                        alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=alt_language_name)
+                        alt_final = alt_polished.get("sentence_html", alt_final)
                 alt_output = to_plain_text(alt_final) if plain_text else alt_final
                 # Add alternative as Result 2
                 result += f"\n\n{'='*50}\n\n"
                 result += f"🔗 Result 2 - Alternative from article:\n"
+                result += f"💡 Alternative anchor: '{alt_anchor}'\n"
+                if needs_bridge:
+                    result += f"🌉 Bridge paragraph created (topics were loosely related)\n\n"
+                else:
+                    result += f"\n"
                 result += f"Original: {alt_sentence_original}\n\n"
                 result += f"Suggested: {alt_output}"