EdysorEdutech committed on
Commit
dc950b9
·
verified ·
1 Parent(s): 6b6a48c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -509
app.py CHANGED
@@ -643,7 +643,7 @@ class EnhancedDipperHumanizer:
643
  return text
644
 
645
  def preserve_keywords(self, text, keywords):
646
- """Mark keywords to preserve them during paraphrasing"""
647
  if not keywords:
648
  return text, {}
649
 
@@ -656,7 +656,7 @@ class EnhancedDipperHumanizer:
656
 
657
  for i, keyword in enumerate(sorted_keywords):
658
  # Use unique markers that won't be confused
659
- placeholder = f"__KW{i:03d}__" # e.g., __KW001__
660
 
661
  # Find all occurrences of the keyword (case-insensitive)
662
  pattern = r'\b' + re.escape(keyword) + r'\b'
@@ -669,378 +669,39 @@ class EnhancedDipperHumanizer:
669
  start, end = match.span()
670
  modified_text = modified_text[:start] + placeholder + modified_text[end:]
671
  # Store the original case version
672
- keyword_map[placeholder] = original_keyword
 
673
 
674
  return modified_text, keyword_map
675
 
676
  def restore_keywords_robust(self, text, keyword_map):
677
- """Restore keywords with more flexible pattern matching - ENHANCED VERSION"""
678
  if not keyword_map:
679
  return text
680
 
681
  restored_text = text
682
 
683
- # Debug: print what we're working with
684
- print(f"Restoring keywords in text: {restored_text[:100]}...")
685
- print(f"Keyword map: {keyword_map}")
686
-
687
- # Track which positions have been replaced to avoid double replacement
688
- replaced_positions = set()
689
-
690
- # First pass: Direct placeholder replacement
691
- for placeholder, keyword in keyword_map.items():
692
- if placeholder in restored_text:
693
- print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
694
- restored_text = restored_text.replace(placeholder, keyword)
695
- # Mark positions as replaced
696
- for match in re.finditer(re.escape(keyword), restored_text):
697
- replaced_positions.update(range(match.start(), match.end()))
698
-
699
- # Second pass: Handle any mangled placeholders with EXPANDED patterns
700
- for placeholder, keyword in keyword_map.items():
701
- # Extract the number from placeholder
702
- match = re.search(r'__KW(\d+)__', placeholder)
703
- if match:
704
- num = match.group(1)
705
-
706
- # EXPANDED patterns the model might create
707
- patterns = [
708
- # Original patterns
709
- (f'__KW{num}__', keyword),
710
- (f'__ KW{num}__', keyword),
711
- (f'__KW {num}__', keyword),
712
- (f'__ KW {num} __', keyword),
713
- (f'_KW{num}_', keyword),
714
- (f'_kw{num}_', keyword),
715
- (f'KW{num}', keyword),
716
- (f'KW {num}', keyword),
717
- (f'__kw{num}__', keyword),
718
- (f'__Kw{num}__', keyword),
719
- (f'__ kw{num}__', keyword),
720
- (f'__KW{num}_', keyword),
721
- (f'_KW{num}__', keyword),
722
- (f'kw{num}', keyword),
723
- (f'``KW{num}__', keyword),
724
- (f'``KKW{num}', keyword),
725
-
726
- # NEW patterns for common corruptions
727
- (f'KW{num}', keyword),
728
- (f'kw{num}', keyword),
729
- (f'Kw{num}', keyword),
730
- (f'K W{num}', keyword),
731
- (f'K w{num}', keyword),
732
- (f'k w{num}', keyword),
733
-
734
- # Patterns with punctuation corruption
735
- (f'__KW{num}__.', keyword),
736
- (f'__KW{num}__,', keyword),
737
- (f'__KW{num}__:', keyword),
738
- (f'__KW{num}__;', keyword),
739
- (f'.KW{num}', keyword),
740
- (f',KW{num}', keyword),
741
- (f':KW{num}', keyword),
742
- (f';KW{num}', keyword),
743
- (f'(KW{num})', keyword),
744
- (f'[KW{num}]', keyword),
745
- (f'"KW{num}"', keyword),
746
- (f"'KW{num}'", keyword),
747
-
748
- # Patterns with special characters
749
- (f'--KW{num}--', keyword),
750
- (f'==KW{num}==', keyword),
751
- (f'**KW{num}**', keyword),
752
- (f'##KW{num}##', keyword),
753
- (f'~~KW{num}~~', keyword),
754
- (f'//KW{num}//', keyword),
755
- (f'\\KW{num}\\', keyword),
756
-
757
- # Patterns with HTML entities
758
- (f'<KW{num}>', keyword),
759
- (f'&KW{num}&', keyword),
760
- (f'_KW{num}_', keyword),
761
-
762
- # Patterns with case variations
763
- (f'__kW{num}__', keyword),
764
- (f'__Kw{num}__', keyword),
765
- (f'__KW{num}__'.lower(), keyword),
766
- (f'__KW{num}__'.upper(), keyword),
767
-
768
- # Patterns with extra underscores
769
- (f'___KW{num}___', keyword),
770
- (f'____KW{num}____', keyword),
771
- (f'_____KW{num}_____', keyword),
772
- (f'__KW{num}___', keyword),
773
- (f'___KW{num}__', keyword),
774
-
775
- # Patterns with missing underscores
776
- (f'_KW{num}', keyword),
777
- (f'KW{num}_', keyword),
778
- (f'__KW{num}', keyword),
779
- (f'KW{num}__', keyword),
780
-
781
- # Patterns with dots instead of underscores
782
- (f'..KW{num}..', keyword),
783
- (f'.KW{num}.', keyword),
784
- (f'...KW{num}...', keyword),
785
-
786
- # Patterns with hyphens
787
- (f'-KW{num}-', keyword),
788
- (f'--KW{num}', keyword),
789
- (f'KW{num}--', keyword),
790
-
791
- # Patterns with spaces in the number
792
- (f'__KW {num}__', keyword),
793
- (f'__KW {num}__', keyword),
794
- (f'__KW {num}__', keyword),
795
-
796
- # Patterns with partial corruption
797
- (f'__{num}__', keyword),
798
- (f'__K{num}__', keyword),
799
- (f'__W{num}__', keyword),
800
- (f'__KW{num}', keyword),
801
- (f'KW{num}__', keyword),
802
-
803
- # Patterns with word boundaries
804
- (f'\\bKW{num}\\b', keyword),
805
- (f'\\b__KW{num}__\\b', keyword),
806
-
807
- # Patterns with newlines or tabs
808
- (f'\\nKW{num}\\n', keyword),
809
- (f'\\tKW{num}\\t', keyword),
810
- (f'\\rKW{num}\\r', keyword),
811
-
812
- # Patterns with common prefixes/suffixes
813
- (f'theKW{num}', keyword),
814
- (f'KW{num}the', keyword),
815
- (f'aKW{num}', keyword),
816
- (f'KW{num}a', keyword),
817
- (f'andKW{num}', keyword),
818
- (f'KW{num}and', keyword),
819
- (f'ofKW{num}', keyword),
820
- (f'KW{num}of', keyword),
821
-
822
- # Patterns with concatenation
823
- (f'KW{num}KW{num}', keyword),
824
- (f'KWKW{num}', keyword),
825
- (f'KW{num}{num}', keyword),
826
-
827
- # Patterns with zero-padding variations
828
- (f'__KW{num.zfill(3)}__', keyword),
829
- (f'__KW{num.zfill(4)}__', keyword),
830
- (f'__KW{num.lstrip("0")}__', keyword),
831
-
832
- # Patterns with brackets and braces
833
- (f'{{KW{num}}}', keyword),
834
- (f'<KW{num}>', keyword),
835
- (f'</KW{num}>', keyword),
836
- (f'<KW{num}/>', keyword),
837
-
838
- # Patterns with quotes variations
839
- (f'`KW{num}`', keyword),
840
- (f'```KW{num}```', keyword),
841
- (f"'''KW{num}'''", keyword),
842
- (f'"""KW{num}"""', keyword),
843
-
844
- # Patterns with markdown-style formatting
845
- (f'*KW{num}*', keyword),
846
- (f'_KW{num}_', keyword),
847
- (f'**KW{num}**', keyword),
848
- (f'__KW{num}__', keyword),
849
- (f'***KW{num}***', keyword),
850
- (f'___KW{num}___', keyword),
851
-
852
- # Patterns with common typos
853
- (f'__WK{num}__', keyword),
854
- (f'__KV{num}__', keyword),
855
- (f'__KQ{num}__', keyword),
856
- (f'__JW{num}__', keyword),
857
- (f'__LW{num}__', keyword),
858
- (f'__KE{num}__', keyword),
859
- (f'__KR{num}__', keyword),
860
-
861
- # Patterns with inserted characters
862
- (f'__K-W{num}__', keyword),
863
- (f'__K_W{num}__', keyword),
864
- (f'__K.W{num}__', keyword),
865
- (f'__K W{num}__', keyword),
866
- (f'__K/W{num}__', keyword),
867
- (f'__K\\W{num}__', keyword),
868
-
869
- # Patterns with duplicated parts
870
- (f'____KWKW{num}____', keyword),
871
- (f'__KWKW{num}__', keyword),
872
- (f'__KW{num}{num}__', keyword),
873
- (f'__KW{num}KW{num}__', keyword),
874
-
875
- # Patterns with reversed parts
876
- (f'__WK{num}__', keyword),
877
- (f'{num}KW__', keyword),
878
- (f'__{num}KW__', keyword),
879
-
880
- # Patterns with common OCR errors
881
- (f'__KVV{num}__', keyword),
882
- (f'__l<W{num}__', keyword),
883
- (f'__l(W{num}__', keyword),
884
- (f'__I<W{num}__', keyword),
885
-
886
- # Patterns with unicode variations
887
- (f'__KW{num}__', keyword),
888
- (f'__KW{num}__', keyword),
889
- (f'——KW{num}——', keyword),
890
- (f'‗‗KW{num}‗‗', keyword),
891
- ]
892
-
893
- # Apply patterns
894
- for pattern, replacement in patterns:
895
- if pattern in restored_text:
896
- # Check if this position has already been replaced
897
- start_pos = restored_text.find(pattern)
898
- if start_pos != -1 and not any(pos in replaced_positions for pos in range(start_pos, start_pos + len(pattern))):
899
- print(f"Found pattern '{pattern}', replacing with {replacement}")
900
- restored_text = restored_text.replace(pattern, replacement, 1)
901
- # Mark new positions as replaced
902
- for match in re.finditer(re.escape(replacement), restored_text):
903
- replaced_positions.update(range(match.start(), match.end()))
904
- break
905
-
906
- # Third pass: Use regex patterns for more complex variations
907
- for placeholder, keyword in keyword_map.items():
908
- match = re.search(r'__KW(\d+)__', placeholder)
909
- if match:
910
- num = match.group(1)
911
-
912
- # Complex regex patterns
913
- regex_patterns = [
914
- # Patterns with variable underscores
915
- (r'_{1,5}KW' + num + r'_{1,5}', keyword),
916
- (r'_{0,5}KW' + num + r'_{0,5}', keyword),
917
-
918
- # Patterns with any characters between K and W
919
- (r'__K.{0,3}W' + num + r'__', keyword),
920
-
921
- # Patterns with spaces and underscores mixed
922
- (r'[\s_]{1,5}KW' + num + r'[\s_]{1,5}', keyword),
923
-
924
- # Patterns with case-insensitive matching
925
- (r'(?i)__kw' + num + r'__', keyword),
926
- (r'(?i)kw' + num, keyword),
927
-
928
- # Patterns with word boundaries
929
- (r'\b[_]*KW' + num + r'[_]*\b', keyword),
930
-
931
- # Patterns with optional characters
932
- (r'_?_?KW' + num + r'_?_?', keyword),
933
-
934
- # Patterns with common separators
935
- (r'[-_\.]{0,3}KW' + num + r'[-_\.]{0,3}', keyword),
936
-
937
- # Patterns with HTML entities mixed in
938
- (r'&[a-z]+;?KW' + num + r'&[a-z]+;?', keyword),
939
-
940
- # Patterns for seriously mangled text
941
- (r'.{0,3}' + num + r'.{0,3}', keyword), # Just the number with some chars
942
-
943
- # Patterns for split placeholders
944
- (r'__\s*KW\s*' + num + r'\s*__', keyword),
945
- (r'_\s*_\s*K\s*W\s*' + num + r'\s*_\s*_', keyword),
946
- ]
947
-
948
- for pattern, replacement in regex_patterns:
949
- matches = list(re.finditer(pattern, restored_text))
950
- for match in matches:
951
- start, end = match.span()
952
- if not any(pos in replaced_positions for pos in range(start, end)):
953
- print(f"Found regex pattern '{pattern}' at position {start}-{end}, replacing with {replacement}")
954
- before = restored_text[:start]
955
- after = restored_text[end:]
956
- restored_text = before + replacement + after
957
- replaced_positions.update(range(start, start + len(replacement)))
958
- break
959
-
960
- # Fourth pass: Smart underscore replacement
961
- # Count underscores and keywords to make intelligent replacements
962
- underscore_groups = list(re.finditer(r'_{2,}', restored_text))
963
- remaining_keywords = [kw for kw in keyword_map.values() if kw not in restored_text]
964
-
965
- if underscore_groups and remaining_keywords:
966
- print(f"Found {len(underscore_groups)} underscore groups and {len(remaining_keywords)} unused keywords")
967
-
968
- # Sort underscore groups by length (descending) to prioritize longer ones
969
- underscore_groups.sort(key=lambda x: x.end() - x.start(), reverse=True)
970
-
971
- for i, underscore_match in enumerate(underscore_groups):
972
- if i < len(remaining_keywords):
973
- start, end = underscore_match.span()
974
- if not any(pos in replaced_positions for pos in range(start, end)):
975
- keyword = remaining_keywords[i]
976
- before = restored_text[:start]
977
- after = restored_text[end:]
978
- restored_text = before + keyword + after
979
- replaced_positions.update(range(start, start + len(keyword)))
980
- print(f"Replaced underscore group at {start}-{end} with keyword: {keyword}")
981
-
982
- # Fifth pass: Context-aware replacement
983
- # Look for patterns where keywords might make sense
984
  for placeholder, keyword in keyword_map.items():
985
- if keyword not in restored_text:
986
- # Look for sentences or phrases that seem to be missing the keyword
987
- # Common patterns where keywords might be missing
988
- context_patterns = [
989
- r'the\s+(?:is|are|was|were)\s+', # "the ___ is"
990
- r'of\s+(?:the\s+)?', # "of the ___"
991
- r'for\s+(?:the\s+)?', # "for the ___"
992
- r'in\s+(?:the\s+)?', # "in the ___"
993
- r'with\s+(?:the\s+)?', # "with the ___"
994
- r'about\s+(?:the\s+)?', # "about the ___"
995
- r'using\s+(?:the\s+)?', # "using the ___"
996
- r'through\s+(?:the\s+)?', # "through the ___"
997
- ]
998
-
999
- for pattern in context_patterns:
1000
- matches = list(re.finditer(pattern + r'([A-Z]{2,}\d+|\b\w{1,3}\b)', restored_text))
1001
- for match in matches:
1002
- suspicious_word = match.group(1)
1003
- # Check if this looks like a mangled placeholder
1004
- if re.match(r'^[A-Z]{1,3}\d+$', suspicious_word) or len(suspicious_word) <= 3:
1005
- start = match.start(1)
1006
- end = match.end(1)
1007
- if not any(pos in replaced_positions for pos in range(start, end)):
1008
- before = restored_text[:start]
1009
- after = restored_text[end:]
1010
- restored_text = before + keyword + after
1011
- replaced_positions.update(range(start, start + len(keyword)))
1012
- print(f"Context-aware replacement: replaced '{suspicious_word}' with '{keyword}'")
1013
- break
1014
-
1015
- # Final cleanup passes
1016
- # Remove any remaining placeholder artifacts
1017
- cleanup_patterns = [
1018
- (r'``+', ''), # Remove backticks
1019
- (r'__+', ' '), # Replace multiple underscores with space
1020
- (r'--+', '-'), # Normalize dashes
1021
- (r'\s{2,}', ' '), # Normalize spaces
1022
- (r'([.,!?])\s*\1+', r'\1'), # Remove duplicate punctuation
1023
- ]
1024
-
1025
- for pattern, replacement in cleanup_patterns:
1026
- restored_text = re.sub(pattern, replacement, restored_text)
1027
-
1028
- # Ensure proper spacing around keywords
1029
- for keyword in keyword_map.values():
1030
- if keyword in restored_text:
1031
- # Fix spacing issues around the keyword
1032
- restored_text = re.sub(r'(\w)(' + re.escape(keyword) + r')', r'\1 \2', restored_text)
1033
- restored_text = re.sub(r'(' + re.escape(keyword) + r')(\w)', r'\1 \2', restored_text)
1034
- # Remove duplicate spaces
1035
- restored_text = re.sub(r'\s+', ' ', restored_text)
1036
-
1037
- # Final verification
1038
- for placeholder, keyword in keyword_map.items():
1039
- if keyword not in restored_text:
1040
- print(f"WARNING: Keyword '{keyword}' still missing from final text!")
1041
-
1042
- # Log final result
1043
- print(f"Final restored text: {restored_text[:100]}...")
1044
 
1045
  return restored_text.strip()
1046
 
@@ -1235,7 +896,7 @@ class EnhancedDipperHumanizer:
1235
 
1236
  # Remove leading non-letter characters carefully
1237
  # IMPORTANT: Preserve keyword placeholders
1238
- if not re.match(r'^(__KW\d+__|KW\d+)', text):
1239
  # Only remove if it doesn't start with a placeholder
1240
  text = re.sub(r'^[^a-zA-Z_]+', '', text)
1241
 
@@ -1253,11 +914,6 @@ class EnhancedDipperHumanizer:
1253
  # Preserve keywords
1254
  text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)
1255
 
1256
- # Add debug logging
1257
- if keyword_map:
1258
- print(f"Debug: Created keyword map: {keyword_map}")
1259
- print(f"Debug: Text with placeholders: {text_with_placeholders[:100]}...")
1260
-
1261
  # Split into sentences for better control
1262
  sentences = self.split_into_sentences_advanced(text_with_placeholders)
1263
  paraphrased_sentences = []
@@ -1361,25 +1017,9 @@ class EnhancedDipperHumanizer:
1361
  # Join sentences back
1362
  result = ' '.join(paraphrased_sentences)
1363
 
1364
- # Debug before restoration
1365
- if keyword_map:
1366
- print(f"Debug: Result before restoration: {result[:100]}...")
1367
- print(f"Debug: Checking for placeholders...")
1368
- for placeholder in keyword_map.keys():
1369
- if placeholder in result:
1370
- print(f"Debug: Found placeholder {placeholder} in result")
1371
- else:
1372
- # Check for mangled versions
1373
- if '___' in result:
1374
- print(f"Debug: Found underscores ___ instead of {placeholder}")
1375
-
1376
  # Restore keywords AFTER joining all sentences
1377
  result = self.restore_keywords_robust(result, keyword_map)
1378
 
1379
- # Debug after restoration
1380
- if keyword_map:
1381
- print(f"Debug: Result after restoration: {result[:100]}...")
1382
-
1383
  # Apply natural human patterns
1384
  result = self.add_natural_human_patterns(result)
1385
 
@@ -1450,7 +1090,7 @@ class EnhancedDipperHumanizer:
1450
 
1451
  # Ensure first letter is capitalized ONLY if it's sentence start
1452
  # Don't capitalize words like "iPhone" or "eBay" or placeholders
1453
- if generated and generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0]) and not generated.startswith('__KW'):
1454
  generated = generated[0].upper() + generated[1:]
1455
 
1456
  return generated
@@ -1640,8 +1280,7 @@ class EnhancedDipperHumanizer:
1640
  # Check if it's not an acronym or proper noun that should stay lowercase
1641
  if (first_word[0].islower() and
1642
  not self.is_likely_acronym_or_proper_noun(first_word) and
1643
- not first_word.startswith('__KW') and
1644
- not first_word.startswith('_kw')):
1645
  # Only capitalize if it's a regular word
1646
  sentence = first_word[0].upper() + first_word[1:] + ' ' + ' '.join(words[1:])
1647
 
@@ -1788,100 +1427,56 @@ class EnhancedDipperHumanizer:
1788
 
1789
  return html_text
1790
 
1791
- def wrap_keywords_in_paragraphs(self, soup, keywords):
1792
- """Wrap keywords with <strong> tags inside <p> tags only"""
1793
  if not keywords:
1794
- return
1795
-
1796
- # Find all paragraph tags
1797
- for p_tag in soup.find_all('p'):
1798
- # Skip paragraphs that are inside special elements
1799
- # Check if paragraph is inside any of these elements
1800
- skip_parents = ['div.author-intro', 'div.cta-box', 'div.testimonial-card',
1801
- 'div.news-box', 'button', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
1802
- 'div.quiz-container', 'div.question-container', 'div.results']
1803
-
1804
- # Check if this paragraph should be skipped
1805
- should_skip = False
1806
- for parent in p_tag.parents:
1807
- # Check by class
1808
- if parent.name == 'div' and parent.get('class'):
1809
- classes = parent.get('class', [])
1810
- if isinstance(classes, list):
1811
- class_str = ' '.join(str(cls) for cls in classes)
1812
- else:
1813
- class_str = str(classes)
1814
-
1815
- if any(skip_class in class_str for skip_class in
1816
- ['author-intro', 'cta-box', 'testimonial-card', 'news-box',
1817
- 'quiz-container', 'question-container', 'results', 'stats-grid',
1818
- 'toc-', 'comparison-tables']):
1819
- should_skip = True
1820
- break
1821
-
1822
- # Check by tag name
1823
- if parent.name in ['button', 'a', 'blockquote', 'details', 'summary']:
1824
- should_skip = True
1825
- break
1826
-
1827
- if should_skip:
1828
- continue
1829
-
1830
- # Additional check: Skip if paragraph has specific classes
1831
- p_classes = p_tag.get('class', [])
1832
- if isinstance(p_classes, list):
1833
- p_class_str = ' '.join(str(cls) for cls in p_classes)
1834
- else:
1835
- p_class_str = str(p_classes)
1836
-
1837
- if any(skip_class in p_class_str for skip_class in ['testimonial-card', 'quiz-', 'stat-']):
1838
- continue
1839
-
1840
- # Process only if this is a regular content paragraph
1841
- # Get all text nodes in this paragraph
1842
- for text_node in p_tag.find_all(string=True):
1843
- # Skip if already inside a strong or b tag
1844
- if text_node.parent.name in ['strong', 'b', 'em', 'i', 'span', 'a']:
1845
  continue
1846
 
1847
- # Skip if the text node's immediate parent isn't the p tag
1848
- # (to avoid nested elements)
1849
- if text_node.parent != p_tag:
1850
- continue
1851
 
1852
- original_text = str(text_node)
 
 
1853
 
1854
- # Skip very short text nodes
1855
- if len(original_text.strip()) < 20:
1856
- continue
1857
 
1858
- modified_text = original_text
1859
-
1860
- # Check each keyword
1861
- for keyword in keywords:
1862
- # Use word boundaries for accurate matching
1863
- pattern = r'\b' + re.escape(keyword) + r'\b'
1864
 
1865
- # Find all matches (case-insensitive)
1866
- matches = list(re.finditer(pattern, modified_text, flags=re.IGNORECASE))
 
 
 
 
1867
 
1868
- # Replace from end to beginning to maintain positions
1869
- for match in reversed(matches):
1870
- start, end = match.span()
1871
- matched_text = match.group(0)
1872
- # Wrap with strong tag
1873
- modified_text = (modified_text[:start] +
1874
- f'<strong>{matched_text}</strong>' +
1875
- modified_text[end:])
1876
-
1877
- # If text was modified, replace the text node
1878
- if modified_text != original_text:
1879
- # Parse the modified text to create new nodes
1880
- new_soup = BeautifulSoup(modified_text, 'html.parser')
1881
- # Replace the text node with the new nodes
1882
- for new_node in reversed(new_soup.contents):
1883
- text_node.insert_after(new_node)
1884
- text_node.extract()
1885
 
1886
  def add_natural_flow_variations(self, text):
1887
  """Add more natural flow and rhythm variations for Originality AI"""
@@ -2020,26 +1615,12 @@ class EnhancedDipperHumanizer:
2020
  if len(original_text.split()) < 3:
2021
  continue
2022
 
2023
- # Debug: Check if keywords are in this text
2024
- text_has_keywords = any(keyword.lower() in original_text.lower() for keyword in all_keywords)
2025
- if text_has_keywords:
2026
- print(f"Debug: Processing text with keywords: {original_text[:50]}...")
2027
-
2028
  # First pass with Dipper (with adjusted diversity)
2029
  paraphrased_text = self.paraphrase_with_dipper(
2030
  original_text,
2031
  keywords=all_keywords
2032
  )
2033
 
2034
- # Verify no placeholders remain
2035
- if '__KW' in paraphrased_text or '___' in paraphrased_text:
2036
- print(f"Warning: Placeholder or underscores found in paraphrased text: {paraphrased_text[:100]}...")
2037
- # Try to restore again with the enhanced function
2038
- temp_map = {}
2039
- for j, keyword in enumerate(all_keywords):
2040
- temp_map[f'__KW{j:03d}__'] = keyword
2041
- paraphrased_text = self.restore_keywords_robust(paraphrased_text, temp_map)
2042
-
2043
  # Second pass with BART for longer texts (increased probability)
2044
  if self.use_bart and len(paraphrased_text.split()) > 8:
2045
  # 50% chance to use BART for more variation (reduced from 60%)
@@ -2058,12 +1639,6 @@ class EnhancedDipperHumanizer:
2058
  # Fix punctuation and formatting
2059
  paraphrased_text = self.fix_punctuation(paraphrased_text)
2060
 
2061
- # Final check for any remaining placeholders or underscores
2062
- if '___' in paraphrased_text or '__KW' in paraphrased_text:
2063
- print(f"Error: Unresolved placeholders in final text")
2064
- # Use original text if we can't resolve placeholders
2065
- paraphrased_text = original_text
2066
-
2067
  # Final quality check
2068
  if paraphrased_text and len(paraphrased_text.split()) >= 3:
2069
  element_info['element'].replace_with(NavigableString(paraphrased_text))
@@ -2077,20 +1652,8 @@ class EnhancedDipperHumanizer:
2077
  progress = (i + 1) / total_elements * 100
2078
  print(f"Progress: {progress:.1f}%")
2079
 
2080
- # Wrap keywords with <strong> tags in paragraphs
2081
- self.wrap_keywords_in_paragraphs(soup, all_keywords)
2082
-
2083
- # Post-process the entire HTML to fix bold/strong formatting
2084
  result = str(soup)
2085
- result = self.post_process_html(result)
2086
-
2087
- # Final safety check for any remaining placeholders or underscores
2088
- if '__KW' in result or re.search(r'_{3,}', result):
2089
- print("Warning: Found placeholders or multiple underscores in final HTML output")
2090
- # Attempt to clean them with keywords
2091
- for i, keyword in enumerate(all_keywords):
2092
- result = result.replace(f'__KW{i:03d}__', keyword)
2093
- result = re.sub(r'_{3,}', keyword, result, count=1)
2094
 
2095
  # Restore all script tags
2096
  for idx, script_content in enumerate(preserved_scripts):
@@ -2102,6 +1665,12 @@ class EnhancedDipperHumanizer:
2102
  placeholder = style_placeholder.format(idx)
2103
  result = result.replace(placeholder, style_content)
2104
 
 
 
 
 
 
 
2105
  # Validate and fix HTML syntax
2106
  result = self.validate_and_fix_html(result)
2107
 
@@ -2293,7 +1862,7 @@ iface = gr.Interface(
2293
  - Stream-of-consciousness elements and rhetorical questions
2294
  - Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
2295
  - Fixed placeholder system that preserves keywords
2296
- - Keywords inside <p> tags are automatically wrapped with <strong> tags
2297
  - Skips content in <strong>, <b>, and heading tags (including inside tables)
2298
  - Designed to pass the strictest AI detection systems
2299
 
 
643
  return text
644
 
645
  def preserve_keywords(self, text, keywords):
646
+ """Mark keywords to preserve them during paraphrasing - SIMPLIFIED"""
647
  if not keywords:
648
  return text, {}
649
 
 
656
 
657
  for i, keyword in enumerate(sorted_keywords):
658
  # Use unique markers that won't be confused
659
+ placeholder = f"KWPH{i:04d}" # e.g., KWPH0001
660
 
661
  # Find all occurrences of the keyword (case-insensitive)
662
  pattern = r'\b' + re.escape(keyword) + r'\b'
 
669
  start, end = match.span()
670
  modified_text = modified_text[:start] + placeholder + modified_text[end:]
671
  # Store the original case version
672
+ if placeholder not in keyword_map:
673
+ keyword_map[placeholder] = original_keyword
674
 
675
  return modified_text, keyword_map
676
 
677
  def restore_keywords_robust(self, text, keyword_map):
678
+ """Restore keywords with simple direct replacement"""
679
  if not keyword_map:
680
  return text
681
 
682
  restored_text = text
683
 
684
+ # Simple direct replacement
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
685
  for placeholder, keyword in keyword_map.items():
686
+ # Direct replacement
687
+ restored_text = restored_text.replace(placeholder, keyword)
688
+
689
+ # Also try with potential variations that might occur
690
+ restored_text = restored_text.replace(f" {placeholder} ", f" {keyword} ")
691
+ restored_text = restored_text.replace(f"{placeholder}.", f"{keyword}.")
692
+ restored_text = restored_text.replace(f"{placeholder},", f"{keyword},")
693
+ restored_text = restored_text.replace(f"{placeholder}!", f"{keyword}!")
694
+ restored_text = restored_text.replace(f"{placeholder}?", f"{keyword}?")
695
+ restored_text = restored_text.replace(f"{placeholder}:", f"{keyword}:")
696
+ restored_text = restored_text.replace(f"{placeholder};", f"{keyword};")
697
+ restored_text = restored_text.replace(f"({placeholder})", f"({keyword})")
698
+ restored_text = restored_text.replace(f'"{placeholder}"', f'"{keyword}"')
699
+ restored_text = restored_text.replace(f"'{placeholder}'", f"'{keyword}'")
700
+
701
+ # Handle case variations
702
+ restored_text = restored_text.replace(placeholder.lower(), keyword)
703
+ restored_text = restored_text.replace(placeholder.upper(), keyword)
704
+ restored_text = restored_text.replace(placeholder.capitalize(), keyword)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705
 
706
  return restored_text.strip()
707
 
 
896
 
897
  # Remove leading non-letter characters carefully
898
  # IMPORTANT: Preserve keyword placeholders
899
+ if not re.match(r'^(KWPH\d+)', text):
900
  # Only remove if it doesn't start with a placeholder
901
  text = re.sub(r'^[^a-zA-Z_]+', '', text)
902
 
 
914
  # Preserve keywords
915
  text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)
916
 
 
 
 
 
 
917
  # Split into sentences for better control
918
  sentences = self.split_into_sentences_advanced(text_with_placeholders)
919
  paraphrased_sentences = []
 
1017
  # Join sentences back
1018
  result = ' '.join(paraphrased_sentences)
1019
 
 
 
 
 
 
 
 
 
 
 
 
 
1020
  # Restore keywords AFTER joining all sentences
1021
  result = self.restore_keywords_robust(result, keyword_map)
1022
 
 
 
 
 
1023
  # Apply natural human patterns
1024
  result = self.add_natural_human_patterns(result)
1025
 
 
1090
 
1091
  # Ensure first letter is capitalized ONLY if it's sentence start
1092
  # Don't capitalize words like "iPhone" or "eBay" or placeholders
1093
+ if generated and generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0]) and not generated.startswith('KWPH'):
1094
  generated = generated[0].upper() + generated[1:]
1095
 
1096
  return generated
 
1280
  # Check if it's not an acronym or proper noun that should stay lowercase
1281
  if (first_word[0].islower() and
1282
  not self.is_likely_acronym_or_proper_noun(first_word) and
1283
+ not first_word.startswith('KWPH')):
 
1284
  # Only capitalize if it's a regular word
1285
  sentence = first_word[0].upper() + first_word[1:] + ' ' + ' '.join(words[1:])
1286
 
 
1427
 
1428
  return html_text
1429
 
1430
+ def wrap_keywords_in_bold(self, html_content, keywords):
1431
+ """Wrap all keyword occurrences with <strong> tags - FIXED VERSION"""
1432
  if not keywords:
1433
+ return html_content
1434
+
1435
+ # Parse the HTML
1436
+ soup = BeautifulSoup(html_content, 'html.parser')
1437
+
1438
+ # Process each keyword
1439
+ for keyword in keywords:
1440
+ # Find all text nodes that contain this keyword
1441
+ for element in soup.find_all(string=re.compile(re.escape(keyword), re.IGNORECASE)):
1442
+ # Skip if already inside certain tags
1443
+ parent = element.parent
1444
+ if parent and parent.name in ['script', 'style', 'strong', 'b', 'a', 'button',
1445
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1446
  continue
1447
 
1448
+ # Get the text content
1449
+ text = str(element)
 
 
1450
 
1451
+ # Replace all occurrences of the keyword with <strong> wrapped version
1452
+ # Use a regex to preserve the original case
1453
+ pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
1454
 
1455
+ # Find all matches
1456
+ matches = list(pattern.finditer(text))
 
1457
 
1458
+ if matches:
1459
+ # Build the new text with <strong> tags
1460
+ new_text = ""
1461
+ last_end = 0
 
 
1462
 
1463
+ for match in matches:
1464
+ # Add text before the match
1465
+ new_text += text[last_end:match.start()]
1466
+ # Add the wrapped keyword (preserving original case)
1467
+ new_text += f"<strong>{match.group(0)}</strong>"
1468
+ last_end = match.end()
1469
 
1470
+ # Add remaining text
1471
+ new_text += text[last_end:]
1472
+
1473
+ # Replace the text node with new HTML
1474
+ new_soup = BeautifulSoup(new_text, 'html.parser')
1475
+ for new_element in reversed(list(new_soup.contents)):
1476
+ element.insert_after(new_element)
1477
+ element.extract()
1478
+
1479
+ return str(soup)
 
 
 
 
 
 
 
1480
 
1481
  def add_natural_flow_variations(self, text):
1482
  """Add more natural flow and rhythm variations for Originality AI"""
 
1615
  if len(original_text.split()) < 3:
1616
  continue
1617
 
 
 
 
 
 
1618
  # First pass with Dipper (with adjusted diversity)
1619
  paraphrased_text = self.paraphrase_with_dipper(
1620
  original_text,
1621
  keywords=all_keywords
1622
  )
1623
 
 
 
 
 
 
 
 
 
 
1624
  # Second pass with BART for longer texts (increased probability)
1625
  if self.use_bart and len(paraphrased_text.split()) > 8:
1626
  # 50% chance to use BART for more variation (reduced from 60%)
 
1639
  # Fix punctuation and formatting
1640
  paraphrased_text = self.fix_punctuation(paraphrased_text)
1641
 
 
 
 
 
 
 
1642
  # Final quality check
1643
  if paraphrased_text and len(paraphrased_text.split()) >= 3:
1644
  element_info['element'].replace_with(NavigableString(paraphrased_text))
 
1652
  progress = (i + 1) / total_elements * 100
1653
  print(f"Progress: {progress:.1f}%")
1654
 
1655
+ # Get the processed HTML
 
 
 
1656
  result = str(soup)
 
 
 
 
 
 
 
 
 
1657
 
1658
  # Restore all script tags
1659
  for idx, script_content in enumerate(preserved_scripts):
 
1665
  placeholder = style_placeholder.format(idx)
1666
  result = result.replace(placeholder, style_content)
1667
 
1668
+ # NOW wrap keywords in bold tags after all processing is complete
1669
+ result = self.wrap_keywords_in_bold(result, all_keywords)
1670
+
1671
+ # Post-process the entire HTML to fix bold/strong formatting
1672
+ result = self.post_process_html(result)
1673
+
1674
  # Validate and fix HTML syntax
1675
  result = self.validate_and_fix_html(result)
1676
 
 
1862
  - Stream-of-consciousness elements and rhetorical questions
1863
  - Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
1864
  - Fixed placeholder system that preserves keywords
1865
+ - Keywords are automatically wrapped with <strong> tags
1866
  - Skips content in <strong>, <b>, and heading tags (including inside tables)
1867
  - Designed to pass the strictest AI detection systems
1868