EdysorEdutech committed on
Commit
6b6a48c
·
verified ·
1 Parent(s): 270759b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +378 -144
app.py CHANGED
@@ -674,7 +674,7 @@ class EnhancedDipperHumanizer:
674
  return modified_text, keyword_map
675
 
676
  def restore_keywords_robust(self, text, keyword_map):
677
- """Restore keywords with more flexible pattern matching - FIXED VERSION"""
678
  if not keyword_map:
679
  return text
680
 
@@ -687,134 +687,362 @@ class EnhancedDipperHumanizer:
687
  # Track which positions have been replaced to avoid double replacement
688
  replaced_positions = set()
689
 
690
- # First, create a reverse map for easier debugging
691
- reverse_map = {}
692
  for placeholder, keyword in keyword_map.items():
693
- # Extract number from placeholder
694
- match = re.search(r'__KW(\d+)__', placeholder)
695
- if match:
696
- num = match.group(1)
697
- reverse_map[num] = keyword
698
- reverse_map[f'KW{num}'] = keyword
699
- reverse_map[f'kw{num}'] = keyword
700
-
701
- # CRITICAL FIX: Replace ALL variations of KW patterns
702
- # Start with the most specific patterns first
703
- all_patterns = []
704
-
705
  for placeholder, keyword in keyword_map.items():
 
706
  match = re.search(r'__KW(\d+)__', placeholder)
707
  if match:
708
  num = match.group(1)
709
 
710
- # Add all possible variations with this number
711
- patterns_to_add = [
712
- # Complete patterns first
713
  (f'__KW{num}__', keyword),
714
- (f'__KW{num}', keyword),
715
- (f'KW{num}__', keyword),
716
- (f'__kw{num}__', keyword),
717
  (f'_KW{num}_', keyword),
718
- (f'_KW{num}', keyword),
719
- (f'KW{num}_', keyword),
 
 
 
 
 
 
 
 
 
720
 
721
- # Bare patterns
722
  (f'KW{num}', keyword),
723
  (f'kw{num}', keyword),
724
  (f'Kw{num}', keyword),
725
- (f'KW{num.lstrip("0")}', keyword), # Remove leading zeros
 
 
726
 
727
- # With punctuation
728
- (f'KW{num}.', keyword),
729
- (f'KW{num},', keyword),
730
- (f'KW{num}:', keyword),
731
- (f'KW{num};', keyword),
732
- (f'KW{num}!', keyword),
733
- (f'KW{num}?', keyword),
734
- (f'KW{num})', keyword),
735
- (f'(KW{num}', keyword),
 
 
 
 
736
 
737
- # Common corruptions
738
- (f'KW-{num}', keyword),
739
- (f'KW_{num}', keyword),
740
- (f'KW {num}', keyword),
741
- (f'K W{num}', keyword),
742
- (f'KVV{num}', keyword),
743
- (f'KKW{num}', keyword),
744
- (f'WK{num}', keyword),
 
 
 
 
 
745
 
746
- # Multiple underscores
 
 
 
 
 
 
747
  (f'___KW{num}___', keyword),
748
  (f'____KW{num}____', keyword),
 
749
  (f'__KW{num}___', keyword),
750
  (f'___KW{num}__', keyword),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
  ]
752
 
753
- all_patterns.extend(patterns_to_add)
754
-
755
- # Sort patterns by length (longest first) to avoid partial replacements
756
- all_patterns.sort(key=lambda x: len(x[0]), reverse=True)
757
-
758
- # Apply all patterns
759
- for pattern, keyword in all_patterns:
760
- if pattern in restored_text:
761
- print(f"Found pattern '{pattern}', replacing with '{keyword}'")
762
- restored_text = restored_text.replace(pattern, keyword)
763
-
764
- # Special handling for isolated KW patterns
765
- # This catches cases like "KW-KW", "KW-s", etc.
766
- kw_pattern = r'\bKW(?:\d*)\b'
767
- matches = list(re.finditer(kw_pattern, restored_text))
768
-
769
- if matches and keyword_map:
770
- # Use the first keyword as a fallback for unmatched KW patterns
771
- fallback_keyword = list(keyword_map.values())[0]
772
- for match in reversed(matches): # Process from end to maintain positions
773
- kw_text = match.group(0)
774
- if kw_text not in ['KW' + k.split('KW')[1].split('__')[0] for k in keyword_map.keys()]:
775
- # This is an orphaned KW pattern
776
- start, end = match.span()
777
- print(f"Replacing orphaned '{kw_text}' with '{fallback_keyword}'")
778
- restored_text = restored_text[:start] + fallback_keyword + restored_text[end:]
779
-
780
- # Final cleanup: Remove any remaining multiple underscores
781
- restored_text = re.sub(r'_{2,}', ' ', restored_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
 
783
- # Remove any remaining KW patterns that weren't caught
784
- # This is a last resort to clean up any stragglers
785
- restored_text = re.sub(r'\bKW\d*\b', '', restored_text)
786
 
787
- # Clean up any double spaces created
788
- restored_text = re.sub(r'\s+', ' ', restored_text)
 
 
 
 
 
 
789
 
790
  # Final verification
791
- remaining_kw = re.findall(r'KW\d+', restored_text)
792
- if remaining_kw:
793
- print(f"WARNING: Remaining KW patterns found: {remaining_kw}")
794
-
795
- return restored_text.strip()
796
-
797
def final_kw_cleanup(self, text, keywords):
    """Replace any leftover ``KW``/``KW<digits>`` tokens in *text* with keywords.

    Tokens are matched as whole words (``\\bKW\\d*\\b``) and replaced in
    left-to-right order. The n-th token receives ``keywords[n % len(keywords)]``,
    so keywords are reused cyclically when there are more tokens than keywords.

    Args:
        text: The string to clean.
        keywords: Indexable sequence of replacement strings. If empty or
            ``None``, *text* is returned unchanged.

    Returns:
        The text with every leftover token replaced, or *text* itself when
        there is nothing to replace.
    """
    if not keywords:
        return text

    # Collect every leftover token once, in document order.
    leftovers = list(re.finditer(r'\bKW\d*\b', text))
    if not leftovers:
        return text

    print(f"Final cleanup: Found {len(leftovers)} remaining KW patterns")

    # Rebuild the string in a single pass. Keywords are spliced in literally
    # rather than passed to re.sub as a replacement template, so backslashes
    # in a keyword (e.g. "C:\data") are not treated as regex escapes, and text
    # that was just inserted is never rescanned and replaced a second time.
    keyword_count = len(keywords)
    pieces = []
    cursor = 0
    for index, token in enumerate(leftovers):
        pieces.append(text[cursor:token.start()])
        pieces.append(keywords[index % keyword_count])
        cursor = token.end()
    pieces.append(text[cursor:])

    return "".join(pieces)
818
 
819
  def should_skip_element(self, element, text):
820
  """Determine if an element should be skipped from paraphrasing"""
@@ -1561,13 +1789,14 @@ class EnhancedDipperHumanizer:
1561
  return html_text
1562
 
1563
  def wrap_keywords_in_paragraphs(self, soup, keywords):
1564
- """Wrap keywords with <strong> tags inside <p> tags only - FIXED VERSION"""
1565
  if not keywords:
1566
  return
1567
 
1568
  # Find all paragraph tags
1569
  for p_tag in soup.find_all('p'):
1570
  # Skip paragraphs that are inside special elements
 
1571
  skip_parents = ['div.author-intro', 'div.cta-box', 'div.testimonial-card',
1572
  'div.news-box', 'button', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
1573
  'div.quiz-container', 'div.question-container', 'div.results']
@@ -1575,6 +1804,7 @@ class EnhancedDipperHumanizer:
1575
  # Check if this paragraph should be skipped
1576
  should_skip = False
1577
  for parent in p_tag.parents:
 
1578
  if parent.name == 'div' and parent.get('class'):
1579
  classes = parent.get('class', [])
1580
  if isinstance(classes, list):
@@ -1589,6 +1819,7 @@ class EnhancedDipperHumanizer:
1589
  should_skip = True
1590
  break
1591
 
 
1592
  if parent.name in ['button', 'a', 'blockquote', 'details', 'summary']:
1593
  should_skip = True
1594
  break
@@ -1606,50 +1837,51 @@ class EnhancedDipperHumanizer:
1606
  if any(skip_class in p_class_str for skip_class in ['testimonial-card', 'quiz-', 'stat-']):
1607
  continue
1608
 
1609
- # NEW APPROACH: Process the entire paragraph's HTML at once
1610
- try:
1611
- # Get the paragraph's inner HTML as a string
1612
- p_html = str(p_tag.decode_contents())
 
 
 
 
 
 
 
 
 
 
 
 
 
1613
 
1614
- # Track if we made any changes
1615
- modified = False
1616
 
1617
- # Process each keyword
1618
  for keyword in keywords:
1619
- # Create pattern that won't match if already in a tag
1620
- # This regex ensures we don't wrap keywords that are already inside HTML tags
1621
- pattern = r'(?<!<[^>]*)(?<!>)\b(' + re.escape(keyword) + r')\b(?![^<]*>)'
1622
 
1623
- # Count matches before replacement
1624
- matches_before = len(re.findall(pattern, p_html, flags=re.IGNORECASE))
1625
 
1626
- if matches_before > 0:
1627
- # Replace with strong tags, preserving original case
1628
- p_html = re.sub(
1629
- pattern,
1630
- r'<strong>\1</strong>',
1631
- p_html,
1632
- flags=re.IGNORECASE
1633
- )
1634
- modified = True
1635
 
1636
- # If we modified the HTML, update the paragraph
1637
- if modified:
1638
- # Clear the paragraph
1639
- p_tag.clear()
1640
-
1641
- # Parse the modified HTML and add it back
1642
- # Use 'html.parser' to avoid encoding issues
1643
- modified_soup = BeautifulSoup(p_html, 'html.parser')
1644
-
1645
- # Add all the parsed content back to the paragraph
1646
- for element in modified_soup:
1647
- p_tag.append(element)
1648
-
1649
- except Exception as e:
1650
- print(f"Error processing paragraph for keywords: {str(e)}")
1651
- # If there's an error, skip this paragraph
1652
- continue
1653
 
1654
  def add_natural_flow_variations(self, text):
1655
  """Add more natural flow and rhythm variations for Originality AI"""
@@ -1853,10 +2085,12 @@ class EnhancedDipperHumanizer:
1853
  result = self.post_process_html(result)
1854
 
1855
  # Final safety check for any remaining placeholders or underscores
1856
- if '__KW' in result or re.search(r'_{3,}', result) or re.search(r'\bKW\d*\b', result):
1857
- print("Warning: Found placeholders or KW patterns in final HTML output")
1858
  # Attempt to clean them with keywords
1859
- result = self.final_kw_cleanup(result, all_keywords)
 
 
1860
 
1861
  # Restore all script tags
1862
  for idx, script_content in enumerate(preserved_scripts):
 
674
  return modified_text, keyword_map
675
 
676
  def restore_keywords_robust(self, text, keyword_map):
677
+ """Restore keywords with more flexible pattern matching - ENHANCED VERSION"""
678
  if not keyword_map:
679
  return text
680
 
 
687
  # Track which positions have been replaced to avoid double replacement
688
  replaced_positions = set()
689
 
690
+ # First pass: Direct placeholder replacement
 
691
  for placeholder, keyword in keyword_map.items():
692
+ if placeholder in restored_text:
693
+ print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
694
+ restored_text = restored_text.replace(placeholder, keyword)
695
+ # Mark positions as replaced
696
+ for match in re.finditer(re.escape(keyword), restored_text):
697
+ replaced_positions.update(range(match.start(), match.end()))
698
+
699
+ # Second pass: Handle any mangled placeholders with EXPANDED patterns
 
 
 
 
700
  for placeholder, keyword in keyword_map.items():
701
+ # Extract the number from placeholder
702
  match = re.search(r'__KW(\d+)__', placeholder)
703
  if match:
704
  num = match.group(1)
705
 
706
+ # EXPANDED patterns the model might create
707
+ patterns = [
708
+ # Original patterns
709
  (f'__KW{num}__', keyword),
710
+ (f'__ KW{num}__', keyword),
711
+ (f'__KW {num}__', keyword),
712
+ (f'__ KW {num} __', keyword),
713
  (f'_KW{num}_', keyword),
714
+ (f'_kw{num}_', keyword),
715
+ (f'KW{num}', keyword),
716
+ (f'KW {num}', keyword),
717
+ (f'__kw{num}__', keyword),
718
+ (f'__Kw{num}__', keyword),
719
+ (f'__ kw{num}__', keyword),
720
+ (f'__KW{num}_', keyword),
721
+ (f'_KW{num}__', keyword),
722
+ (f'kw{num}', keyword),
723
+ (f'``KW{num}__', keyword),
724
+ (f'``KKW{num}', keyword),
725
 
726
+ # NEW patterns for common corruptions
727
  (f'KW{num}', keyword),
728
  (f'kw{num}', keyword),
729
  (f'Kw{num}', keyword),
730
+ (f'K W{num}', keyword),
731
+ (f'K w{num}', keyword),
732
+ (f'k w{num}', keyword),
733
 
734
+ # Patterns with punctuation corruption
735
+ (f'__KW{num}__.', keyword),
736
+ (f'__KW{num}__,', keyword),
737
+ (f'__KW{num}__:', keyword),
738
+ (f'__KW{num}__;', keyword),
739
+ (f'.KW{num}', keyword),
740
+ (f',KW{num}', keyword),
741
+ (f':KW{num}', keyword),
742
+ (f';KW{num}', keyword),
743
+ (f'(KW{num})', keyword),
744
+ (f'[KW{num}]', keyword),
745
+ (f'"KW{num}"', keyword),
746
+ (f"'KW{num}'", keyword),
747
 
748
+ # Patterns with special characters
749
+ (f'--KW{num}--', keyword),
750
+ (f'==KW{num}==', keyword),
751
+ (f'**KW{num}**', keyword),
752
+ (f'##KW{num}##', keyword),
753
+ (f'~~KW{num}~~', keyword),
754
+ (f'//KW{num}//', keyword),
755
+ (f'\\KW{num}\\', keyword),
756
+
757
+ # Patterns with HTML entities
758
+ (f'&lt;KW{num}&gt;', keyword),
759
+ (f'&amp;KW{num}&amp;', keyword),
760
+ (f'&#95;KW{num}&#95;', keyword),
761
 
762
+ # Patterns with case variations
763
+ (f'__kW{num}__', keyword),
764
+ (f'__Kw{num}__', keyword),
765
+ (f'__KW{num}__'.lower(), keyword),
766
+ (f'__KW{num}__'.upper(), keyword),
767
+
768
+ # Patterns with extra underscores
769
  (f'___KW{num}___', keyword),
770
  (f'____KW{num}____', keyword),
771
+ (f'_____KW{num}_____', keyword),
772
  (f'__KW{num}___', keyword),
773
  (f'___KW{num}__', keyword),
774
+
775
+ # Patterns with missing underscores
776
+ (f'_KW{num}', keyword),
777
+ (f'KW{num}_', keyword),
778
+ (f'__KW{num}', keyword),
779
+ (f'KW{num}__', keyword),
780
+
781
+ # Patterns with dots instead of underscores
782
+ (f'..KW{num}..', keyword),
783
+ (f'.KW{num}.', keyword),
784
+ (f'...KW{num}...', keyword),
785
+
786
+ # Patterns with hyphens
787
+ (f'-KW{num}-', keyword),
788
+ (f'--KW{num}', keyword),
789
+ (f'KW{num}--', keyword),
790
+
791
+ # Patterns with spaces in the number
792
+ (f'__KW {num}__', keyword),
793
+ (f'__KW {num}__', keyword),
794
+ (f'__KW {num}__', keyword),
795
+
796
+ # Patterns with partial corruption
797
+ (f'__{num}__', keyword),
798
+ (f'__K{num}__', keyword),
799
+ (f'__W{num}__', keyword),
800
+ (f'__KW{num}', keyword),
801
+ (f'KW{num}__', keyword),
802
+
803
+ # Patterns with word boundaries
804
+ (f'\\bKW{num}\\b', keyword),
805
+ (f'\\b__KW{num}__\\b', keyword),
806
+
807
+ # Patterns with newlines or tabs
808
+ (f'\\nKW{num}\\n', keyword),
809
+ (f'\\tKW{num}\\t', keyword),
810
+ (f'\\rKW{num}\\r', keyword),
811
+
812
+ # Patterns with common prefixes/suffixes
813
+ (f'theKW{num}', keyword),
814
+ (f'KW{num}the', keyword),
815
+ (f'aKW{num}', keyword),
816
+ (f'KW{num}a', keyword),
817
+ (f'andKW{num}', keyword),
818
+ (f'KW{num}and', keyword),
819
+ (f'ofKW{num}', keyword),
820
+ (f'KW{num}of', keyword),
821
+
822
+ # Patterns with concatenation
823
+ (f'KW{num}KW{num}', keyword),
824
+ (f'KWKW{num}', keyword),
825
+ (f'KW{num}{num}', keyword),
826
+
827
+ # Patterns with zero-padding variations
828
+ (f'__KW{num.zfill(3)}__', keyword),
829
+ (f'__KW{num.zfill(4)}__', keyword),
830
+ (f'__KW{num.lstrip("0")}__', keyword),
831
+
832
+ # Patterns with brackets and braces
833
+ (f'{{KW{num}}}', keyword),
834
+ (f'<KW{num}>', keyword),
835
+ (f'</KW{num}>', keyword),
836
+ (f'<KW{num}/>', keyword),
837
+
838
+ # Patterns with quotes variations
839
+ (f'`KW{num}`', keyword),
840
+ (f'```KW{num}```', keyword),
841
+ (f"'''KW{num}'''", keyword),
842
+ (f'"""KW{num}"""', keyword),
843
+
844
+ # Patterns with markdown-style formatting
845
+ (f'*KW{num}*', keyword),
846
+ (f'_KW{num}_', keyword),
847
+ (f'**KW{num}**', keyword),
848
+ (f'__KW{num}__', keyword),
849
+ (f'***KW{num}***', keyword),
850
+ (f'___KW{num}___', keyword),
851
+
852
+ # Patterns with common typos
853
+ (f'__WK{num}__', keyword),
854
+ (f'__KV{num}__', keyword),
855
+ (f'__KQ{num}__', keyword),
856
+ (f'__JW{num}__', keyword),
857
+ (f'__LW{num}__', keyword),
858
+ (f'__KE{num}__', keyword),
859
+ (f'__KR{num}__', keyword),
860
+
861
+ # Patterns with inserted characters
862
+ (f'__K-W{num}__', keyword),
863
+ (f'__K_W{num}__', keyword),
864
+ (f'__K.W{num}__', keyword),
865
+ (f'__K W{num}__', keyword),
866
+ (f'__K/W{num}__', keyword),
867
+ (f'__K\\W{num}__', keyword),
868
+
869
+ # Patterns with duplicated parts
870
+ (f'____KWKW{num}____', keyword),
871
+ (f'__KWKW{num}__', keyword),
872
+ (f'__KW{num}{num}__', keyword),
873
+ (f'__KW{num}KW{num}__', keyword),
874
+
875
+ # Patterns with reversed parts
876
+ (f'__WK{num}__', keyword),
877
+ (f'{num}KW__', keyword),
878
+ (f'__{num}KW__', keyword),
879
+
880
+ # Patterns with common OCR errors
881
+ (f'__KVV{num}__', keyword),
882
+ (f'__l<W{num}__', keyword),
883
+ (f'__l(W{num}__', keyword),
884
+ (f'__I<W{num}__', keyword),
885
+
886
+ # Patterns with unicode variations
887
+ (f'__KW{num}__', keyword),
888
+ (f'__KW{num}__', keyword),
889
+ (f'——KW{num}——', keyword),
890
+ (f'‗‗KW{num}‗‗', keyword),
891
  ]
892
 
893
+ # Apply patterns
894
+ for pattern, replacement in patterns:
895
+ if pattern in restored_text:
896
+ # Check if this position has already been replaced
897
+ start_pos = restored_text.find(pattern)
898
+ if start_pos != -1 and not any(pos in replaced_positions for pos in range(start_pos, start_pos + len(pattern))):
899
+ print(f"Found pattern '{pattern}', replacing with {replacement}")
900
+ restored_text = restored_text.replace(pattern, replacement, 1)
901
+ # Mark new positions as replaced
902
+ for match in re.finditer(re.escape(replacement), restored_text):
903
+ replaced_positions.update(range(match.start(), match.end()))
904
+ break
905
+
906
+ # Third pass: Use regex patterns for more complex variations
907
+ for placeholder, keyword in keyword_map.items():
908
+ match = re.search(r'__KW(\d+)__', placeholder)
909
+ if match:
910
+ num = match.group(1)
911
+
912
+ # Complex regex patterns
913
+ regex_patterns = [
914
+ # Patterns with variable underscores
915
+ (r'_{1,5}KW' + num + r'_{1,5}', keyword),
916
+ (r'_{0,5}KW' + num + r'_{0,5}', keyword),
917
+
918
+ # Patterns with any characters between K and W
919
+ (r'__K.{0,3}W' + num + r'__', keyword),
920
+
921
+ # Patterns with spaces and underscores mixed
922
+ (r'[\s_]{1,5}KW' + num + r'[\s_]{1,5}', keyword),
923
+
924
+ # Patterns with case-insensitive matching
925
+ (r'(?i)__kw' + num + r'__', keyword),
926
+ (r'(?i)kw' + num, keyword),
927
+
928
+ # Patterns with word boundaries
929
+ (r'\b[_]*KW' + num + r'[_]*\b', keyword),
930
+
931
+ # Patterns with optional characters
932
+ (r'_?_?KW' + num + r'_?_?', keyword),
933
+
934
+ # Patterns with common separators
935
+ (r'[-_\.]{0,3}KW' + num + r'[-_\.]{0,3}', keyword),
936
+
937
+ # Patterns with HTML entities mixed in
938
+ (r'&[a-z]+;?KW' + num + r'&[a-z]+;?', keyword),
939
+
940
+ # Patterns for seriously mangled text
941
+ (r'.{0,3}' + num + r'.{0,3}', keyword), # Just the number with some chars
942
+
943
+ # Patterns for split placeholders
944
+ (r'__\s*KW\s*' + num + r'\s*__', keyword),
945
+ (r'_\s*_\s*K\s*W\s*' + num + r'\s*_\s*_', keyword),
946
+ ]
947
+
948
+ for pattern, replacement in regex_patterns:
949
+ matches = list(re.finditer(pattern, restored_text))
950
+ for match in matches:
951
+ start, end = match.span()
952
+ if not any(pos in replaced_positions for pos in range(start, end)):
953
+ print(f"Found regex pattern '{pattern}' at position {start}-{end}, replacing with {replacement}")
954
+ before = restored_text[:start]
955
+ after = restored_text[end:]
956
+ restored_text = before + replacement + after
957
+ replaced_positions.update(range(start, start + len(replacement)))
958
+ break
959
+
960
+ # Fourth pass: Smart underscore replacement
961
+ # Count underscores and keywords to make intelligent replacements
962
+ underscore_groups = list(re.finditer(r'_{2,}', restored_text))
963
+ remaining_keywords = [kw for kw in keyword_map.values() if kw not in restored_text]
964
+
965
+ if underscore_groups and remaining_keywords:
966
+ print(f"Found {len(underscore_groups)} underscore groups and {len(remaining_keywords)} unused keywords")
967
+
968
+ # Sort underscore groups by length (descending) to prioritize longer ones
969
+ underscore_groups.sort(key=lambda x: x.end() - x.start(), reverse=True)
970
+
971
+ for i, underscore_match in enumerate(underscore_groups):
972
+ if i < len(remaining_keywords):
973
+ start, end = underscore_match.span()
974
+ if not any(pos in replaced_positions for pos in range(start, end)):
975
+ keyword = remaining_keywords[i]
976
+ before = restored_text[:start]
977
+ after = restored_text[end:]
978
+ restored_text = before + keyword + after
979
+ replaced_positions.update(range(start, start + len(keyword)))
980
+ print(f"Replaced underscore group at {start}-{end} with keyword: {keyword}")
981
+
982
+ # Fifth pass: Context-aware replacement
983
+ # Look for patterns where keywords might make sense
984
+ for placeholder, keyword in keyword_map.items():
985
+ if keyword not in restored_text:
986
+ # Look for sentences or phrases that seem to be missing the keyword
987
+ # Common patterns where keywords might be missing
988
+ context_patterns = [
989
+ r'the\s+(?:is|are|was|were)\s+', # "the ___ is"
990
+ r'of\s+(?:the\s+)?', # "of the ___"
991
+ r'for\s+(?:the\s+)?', # "for the ___"
992
+ r'in\s+(?:the\s+)?', # "in the ___"
993
+ r'with\s+(?:the\s+)?', # "with the ___"
994
+ r'about\s+(?:the\s+)?', # "about the ___"
995
+ r'using\s+(?:the\s+)?', # "using the ___"
996
+ r'through\s+(?:the\s+)?', # "through the ___"
997
+ ]
998
+
999
+ for pattern in context_patterns:
1000
+ matches = list(re.finditer(pattern + r'([A-Z]{2,}\d+|\b\w{1,3}\b)', restored_text))
1001
+ for match in matches:
1002
+ suspicious_word = match.group(1)
1003
+ # Check if this looks like a mangled placeholder
1004
+ if re.match(r'^[A-Z]{1,3}\d+$', suspicious_word) or len(suspicious_word) <= 3:
1005
+ start = match.start(1)
1006
+ end = match.end(1)
1007
+ if not any(pos in replaced_positions for pos in range(start, end)):
1008
+ before = restored_text[:start]
1009
+ after = restored_text[end:]
1010
+ restored_text = before + keyword + after
1011
+ replaced_positions.update(range(start, start + len(keyword)))
1012
+ print(f"Context-aware replacement: replaced '{suspicious_word}' with '{keyword}'")
1013
+ break
1014
+
1015
+ # Final cleanup passes
1016
+ # Remove any remaining placeholder artifacts
1017
+ cleanup_patterns = [
1018
+ (r'``+', ''), # Remove backticks
1019
+ (r'__+', ' '), # Replace multiple underscores with space
1020
+ (r'--+', '-'), # Normalize dashes
1021
+ (r'\s{2,}', ' '), # Normalize spaces
1022
+ (r'([.,!?])\s*\1+', r'\1'), # Remove duplicate punctuation
1023
+ ]
1024
 
1025
+ for pattern, replacement in cleanup_patterns:
1026
+ restored_text = re.sub(pattern, replacement, restored_text)
 
1027
 
1028
+ # Ensure proper spacing around keywords
1029
+ for keyword in keyword_map.values():
1030
+ if keyword in restored_text:
1031
+ # Fix spacing issues around the keyword
1032
+ restored_text = re.sub(r'(\w)(' + re.escape(keyword) + r')', r'\1 \2', restored_text)
1033
+ restored_text = re.sub(r'(' + re.escape(keyword) + r')(\w)', r'\1 \2', restored_text)
1034
+ # Remove duplicate spaces
1035
+ restored_text = re.sub(r'\s+', ' ', restored_text)
1036
 
1037
  # Final verification
1038
+ for placeholder, keyword in keyword_map.items():
1039
+ if keyword not in restored_text:
1040
+ print(f"WARNING: Keyword '{keyword}' still missing from final text!")
 
 
 
 
 
 
 
 
 
 
1041
 
1042
+ # Log final result
1043
+ print(f"Final restored text: {restored_text[:100]}...")
 
 
 
 
 
 
 
 
 
1044
 
1045
+ return restored_text.strip()
1046
 
1047
  def should_skip_element(self, element, text):
1048
  """Determine if an element should be skipped from paraphrasing"""
 
1789
  return html_text
1790
 
1791
  def wrap_keywords_in_paragraphs(self, soup, keywords):
1792
+ """Wrap keywords with <strong> tags inside <p> tags only"""
1793
  if not keywords:
1794
  return
1795
 
1796
  # Find all paragraph tags
1797
  for p_tag in soup.find_all('p'):
1798
  # Skip paragraphs that are inside special elements
1799
+ # Check if paragraph is inside any of these elements
1800
  skip_parents = ['div.author-intro', 'div.cta-box', 'div.testimonial-card',
1801
  'div.news-box', 'button', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
1802
  'div.quiz-container', 'div.question-container', 'div.results']
 
1804
  # Check if this paragraph should be skipped
1805
  should_skip = False
1806
  for parent in p_tag.parents:
1807
+ # Check by class
1808
  if parent.name == 'div' and parent.get('class'):
1809
  classes = parent.get('class', [])
1810
  if isinstance(classes, list):
 
1819
  should_skip = True
1820
  break
1821
 
1822
+ # Check by tag name
1823
  if parent.name in ['button', 'a', 'blockquote', 'details', 'summary']:
1824
  should_skip = True
1825
  break
 
1837
  if any(skip_class in p_class_str for skip_class in ['testimonial-card', 'quiz-', 'stat-']):
1838
  continue
1839
 
1840
+ # Process only if this is a regular content paragraph
1841
+ # Get all text nodes in this paragraph
1842
+ for text_node in p_tag.find_all(string=True):
1843
+ # Skip if already inside a strong or b tag
1844
+ if text_node.parent.name in ['strong', 'b', 'em', 'i', 'span', 'a']:
1845
+ continue
1846
+
1847
+ # Skip if the text node's immediate parent isn't the p tag
1848
+ # (to avoid nested elements)
1849
+ if text_node.parent != p_tag:
1850
+ continue
1851
+
1852
+ original_text = str(text_node)
1853
+
1854
+ # Skip very short text nodes
1855
+ if len(original_text.strip()) < 20:
1856
+ continue
1857
 
1858
+ modified_text = original_text
 
1859
 
1860
+ # Check each keyword
1861
  for keyword in keywords:
1862
+ # Use word boundaries for accurate matching
1863
+ pattern = r'\b' + re.escape(keyword) + r'\b'
 
1864
 
1865
+ # Find all matches (case-insensitive)
1866
+ matches = list(re.finditer(pattern, modified_text, flags=re.IGNORECASE))
1867
 
1868
+ # Replace from end to beginning to maintain positions
1869
+ for match in reversed(matches):
1870
+ start, end = match.span()
1871
+ matched_text = match.group(0)
1872
+ # Wrap with strong tag
1873
+ modified_text = (modified_text[:start] +
1874
+ f'<strong>{matched_text}</strong>' +
1875
+ modified_text[end:])
 
1876
 
1877
+ # If text was modified, replace the text node
1878
+ if modified_text != original_text:
1879
+ # Parse the modified text to create new nodes
1880
+ new_soup = BeautifulSoup(modified_text, 'html.parser')
1881
+ # Replace the text node with the new nodes
1882
+ for new_node in reversed(new_soup.contents):
1883
+ text_node.insert_after(new_node)
1884
+ text_node.extract()
 
 
 
 
 
 
 
 
 
1885
 
1886
  def add_natural_flow_variations(self, text):
1887
  """Add more natural flow and rhythm variations for Originality AI"""
 
2085
  result = self.post_process_html(result)
2086
 
2087
  # Final safety check for any remaining placeholders or underscores
2088
+ if '__KW' in result or re.search(r'_{3,}', result):
2089
+ print("Warning: Found placeholders or multiple underscores in final HTML output")
2090
  # Attempt to clean them with keywords
2091
+ for i, keyword in enumerate(all_keywords):
2092
+ result = result.replace(f'__KW{i:03d}__', keyword)
2093
+ result = re.sub(r'_{3,}', keyword, result, count=1)
2094
 
2095
  # Restore all script tags
2096
  for idx, script_content in enumerate(preserved_scripts):