EdysorEdutech committed on
Commit
dc950b9
·
verified ·
1 Parent(s): 6b6a48c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -509
app.py CHANGED
@@ -643,7 +643,7 @@ class EnhancedDipperHumanizer:
643
  return text
644
 
645
  def preserve_keywords(self, text, keywords):
646
- """Mark keywords to preserve them during paraphrasing"""
647
  if not keywords:
648
  return text, {}
649
 
@@ -656,7 +656,7 @@ class EnhancedDipperHumanizer:
656
 
657
  for i, keyword in enumerate(sorted_keywords):
658
  # Use unique markers that won't be confused
659
- placeholder = f"__KW{i:03d}__" # e.g., __KW001__
660
 
661
  # Find all occurrences of the keyword (case-insensitive)
662
  pattern = r'\b' + re.escape(keyword) + r'\b'
@@ -669,378 +669,39 @@ class EnhancedDipperHumanizer:
669
  start, end = match.span()
670
  modified_text = modified_text[:start] + placeholder + modified_text[end:]
671
  # Store the original case version
672
- keyword_map[placeholder] = original_keyword
 
673
 
674
  return modified_text, keyword_map
675
 
676
  def restore_keywords_robust(self, text, keyword_map):
677
- """Restore keywords with more flexible pattern matching - ENHANCED VERSION"""
678
  if not keyword_map:
679
  return text
680
 
681
  restored_text = text
682
 
683
- # Debug: print what we're working with
684
- print(f"Restoring keywords in text: {restored_text[:100]}...")
685
- print(f"Keyword map: {keyword_map}")
686
-
687
- # Track which positions have been replaced to avoid double replacement
688
- replaced_positions = set()
689
-
690
- # First pass: Direct placeholder replacement
691
- for placeholder, keyword in keyword_map.items():
692
- if placeholder in restored_text:
693
- print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
694
- restored_text = restored_text.replace(placeholder, keyword)
695
- # Mark positions as replaced
696
- for match in re.finditer(re.escape(keyword), restored_text):
697
- replaced_positions.update(range(match.start(), match.end()))
698
-
699
- # Second pass: Handle any mangled placeholders with EXPANDED patterns
700
- for placeholder, keyword in keyword_map.items():
701
- # Extract the number from placeholder
702
- match = re.search(r'__KW(\d+)__', placeholder)
703
- if match:
704
- num = match.group(1)
705
-
706
- # EXPANDED patterns the model might create
707
- patterns = [
708
- # Original patterns
709
- (f'__KW{num}__', keyword),
710
- (f'__ KW{num}__', keyword),
711
- (f'__KW {num}__', keyword),
712
- (f'__ KW {num} __', keyword),
713
- (f'_KW{num}_', keyword),
714
- (f'_kw{num}_', keyword),
715
- (f'KW{num}', keyword),
716
- (f'KW {num}', keyword),
717
- (f'__kw{num}__', keyword),
718
- (f'__Kw{num}__', keyword),
719
- (f'__ kw{num}__', keyword),
720
- (f'__KW{num}_', keyword),
721
- (f'_KW{num}__', keyword),
722
- (f'kw{num}', keyword),
723
- (f'``KW{num}__', keyword),
724
- (f'``KKW{num}', keyword),
725
-
726
- # NEW patterns for common corruptions
727
- (f'KW{num}', keyword),
728
- (f'kw{num}', keyword),
729
- (f'Kw{num}', keyword),
730
- (f'K W{num}', keyword),
731
- (f'K w{num}', keyword),
732
- (f'k w{num}', keyword),
733
-
734
- # Patterns with punctuation corruption
735
- (f'__KW{num}__.', keyword),
736
- (f'__KW{num}__,', keyword),
737
- (f'__KW{num}__:', keyword),
738
- (f'__KW{num}__;', keyword),
739
- (f'.KW{num}', keyword),
740
- (f',KW{num}', keyword),
741
- (f':KW{num}', keyword),
742
- (f';KW{num}', keyword),
743
- (f'(KW{num})', keyword),
744
- (f'[KW{num}]', keyword),
745
- (f'"KW{num}"', keyword),
746
- (f"'KW{num}'", keyword),
747
-
748
- # Patterns with special characters
749
- (f'--KW{num}--', keyword),
750
- (f'==KW{num}==', keyword),
751
- (f'**KW{num}**', keyword),
752
- (f'##KW{num}##', keyword),
753
- (f'~~KW{num}~~', keyword),
754
- (f'//KW{num}//', keyword),
755
- (f'\\KW{num}\\', keyword),
756
-
757
- # Patterns with HTML entities
758
- (f'<KW{num}>', keyword),
759
- (f'&KW{num}&', keyword),
760
- (f'_KW{num}_', keyword),
761
-
762
- # Patterns with case variations
763
- (f'__kW{num}__', keyword),
764
- (f'__Kw{num}__', keyword),
765
- (f'__KW{num}__'.lower(), keyword),
766
- (f'__KW{num}__'.upper(), keyword),
767
-
768
- # Patterns with extra underscores
769
- (f'___KW{num}___', keyword),
770
- (f'____KW{num}____', keyword),
771
- (f'_____KW{num}_____', keyword),
772
- (f'__KW{num}___', keyword),
773
- (f'___KW{num}__', keyword),
774
-
775
- # Patterns with missing underscores
776
- (f'_KW{num}', keyword),
777
- (f'KW{num}_', keyword),
778
- (f'__KW{num}', keyword),
779
- (f'KW{num}__', keyword),
780
-
781
- # Patterns with dots instead of underscores
782
- (f'..KW{num}..', keyword),
783
- (f'.KW{num}.', keyword),
784
- (f'...KW{num}...', keyword),
785
-
786
- # Patterns with hyphens
787
- (f'-KW{num}-', keyword),
788
- (f'--KW{num}', keyword),
789
- (f'KW{num}--', keyword),
790
-
791
- # Patterns with spaces in the number
792
- (f'__KW {num}__', keyword),
793
- (f'__KW {num}__', keyword),
794
- (f'__KW {num}__', keyword),
795
-
796
- # Patterns with partial corruption
797
- (f'__{num}__', keyword),
798
- (f'__K{num}__', keyword),
799
- (f'__W{num}__', keyword),
800
- (f'__KW{num}', keyword),
801
- (f'KW{num}__', keyword),
802
-
803
- # Patterns with word boundaries
804
- (f'\\bKW{num}\\b', keyword),
805
- (f'\\b__KW{num}__\\b', keyword),
806
-
807
- # Patterns with newlines or tabs
808
- (f'\\nKW{num}\\n', keyword),
809
- (f'\\tKW{num}\\t', keyword),
810
- (f'\\rKW{num}\\r', keyword),
811
-
812
- # Patterns with common prefixes/suffixes
813
- (f'theKW{num}', keyword),
814
- (f'KW{num}the', keyword),
815
- (f'aKW{num}', keyword),
816
- (f'KW{num}a', keyword),
817
- (f'andKW{num}', keyword),
818
- (f'KW{num}and', keyword),
819
- (f'ofKW{num}', keyword),
820
- (f'KW{num}of', keyword),
821
-
822
- # Patterns with concatenation
823
- (f'KW{num}KW{num}', keyword),
824
- (f'KWKW{num}', keyword),
825
- (f'KW{num}{num}', keyword),
826
-
827
- # Patterns with zero-padding variations
828
- (f'__KW{num.zfill(3)}__', keyword),
829
- (f'__KW{num.zfill(4)}__', keyword),
830
- (f'__KW{num.lstrip("0")}__', keyword),
831
-
832
- # Patterns with brackets and braces
833
- (f'{{KW{num}}}', keyword),
834
- (f'<KW{num}>', keyword),
835
- (f'</KW{num}>', keyword),
836
- (f'<KW{num}/>', keyword),
837
-
838
- # Patterns with quotes variations
839
- (f'`KW{num}`', keyword),
840
- (f'```KW{num}```', keyword),
841
- (f"'''KW{num}'''", keyword),
842
- (f'"""KW{num}"""', keyword),
843
-
844
- # Patterns with markdown-style formatting
845
- (f'*KW{num}*', keyword),
846
- (f'_KW{num}_', keyword),
847
- (f'**KW{num}**', keyword),
848
- (f'__KW{num}__', keyword),
849
- (f'***KW{num}***', keyword),
850
- (f'___KW{num}___', keyword),
851
-
852
- # Patterns with common typos
853
- (f'__WK{num}__', keyword),
854
- (f'__KV{num}__', keyword),
855
- (f'__KQ{num}__', keyword),
856
- (f'__JW{num}__', keyword),
857
- (f'__LW{num}__', keyword),
858
- (f'__KE{num}__', keyword),
859
- (f'__KR{num}__', keyword),
860
-
861
- # Patterns with inserted characters
862
- (f'__K-W{num}__', keyword),
863
- (f'__K_W{num}__', keyword),
864
- (f'__K.W{num}__', keyword),
865
- (f'__K W{num}__', keyword),
866
- (f'__K/W{num}__', keyword),
867
- (f'__K\\W{num}__', keyword),
868
-
869
- # Patterns with duplicated parts
870
- (f'____KWKW{num}____', keyword),
871
- (f'__KWKW{num}__', keyword),
872
- (f'__KW{num}{num}__', keyword),
873
- (f'__KW{num}KW{num}__', keyword),
874
-
875
- # Patterns with reversed parts
876
- (f'__WK{num}__', keyword),
877
- (f'{num}KW__', keyword),
878
- (f'__{num}KW__', keyword),
879
-
880
- # Patterns with common OCR errors
881
- (f'__KVV{num}__', keyword),
882
- (f'__l<W{num}__', keyword),
883
- (f'__l(W{num}__', keyword),
884
- (f'__I<W{num}__', keyword),
885
-
886
- # Patterns with unicode variations
887
- (f'__KW{num}__', keyword),
888
- (f'__KW{num}__', keyword),
889
- (f'——KW{num}——', keyword),
890
- (f'‗‗KW{num}‗‗', keyword),
891
- ]
892
-
893
- # Apply patterns
894
- for pattern, replacement in patterns:
895
- if pattern in restored_text:
896
- # Check if this position has already been replaced
897
- start_pos = restored_text.find(pattern)
898
- if start_pos != -1 and not any(pos in replaced_positions for pos in range(start_pos, start_pos + len(pattern))):
899
- print(f"Found pattern '{pattern}', replacing with {replacement}")
900
- restored_text = restored_text.replace(pattern, replacement, 1)
901
- # Mark new positions as replaced
902
- for match in re.finditer(re.escape(replacement), restored_text):
903
- replaced_positions.update(range(match.start(), match.end()))
904
- break
905
-
906
- # Third pass: Use regex patterns for more complex variations
907
- for placeholder, keyword in keyword_map.items():
908
- match = re.search(r'__KW(\d+)__', placeholder)
909
- if match:
910
- num = match.group(1)
911
-
912
- # Complex regex patterns
913
- regex_patterns = [
914
- # Patterns with variable underscores
915
- (r'_{1,5}KW' + num + r'_{1,5}', keyword),
916
- (r'_{0,5}KW' + num + r'_{0,5}', keyword),
917
-
918
- # Patterns with any characters between K and W
919
- (r'__K.{0,3}W' + num + r'__', keyword),
920
-
921
- # Patterns with spaces and underscores mixed
922
- (r'[\s_]{1,5}KW' + num + r'[\s_]{1,5}', keyword),
923
-
924
- # Patterns with case-insensitive matching
925
- (r'(?i)__kw' + num + r'__', keyword),
926
- (r'(?i)kw' + num, keyword),
927
-
928
- # Patterns with word boundaries
929
- (r'\b[_]*KW' + num + r'[_]*\b', keyword),
930
-
931
- # Patterns with optional characters
932
- (r'_?_?KW' + num + r'_?_?', keyword),
933
-
934
- # Patterns with common separators
935
- (r'[-_\.]{0,3}KW' + num + r'[-_\.]{0,3}', keyword),
936
-
937
- # Patterns with HTML entities mixed in
938
- (r'&[a-z]+;?KW' + num + r'&[a-z]+;?', keyword),
939
-
940
- # Patterns for seriously mangled text
941
- (r'.{0,3}' + num + r'.{0,3}', keyword), # Just the number with some chars
942
-
943
- # Patterns for split placeholders
944
- (r'__\s*KW\s*' + num + r'\s*__', keyword),
945
- (r'_\s*_\s*K\s*W\s*' + num + r'\s*_\s*_', keyword),
946
- ]
947
-
948
- for pattern, replacement in regex_patterns:
949
- matches = list(re.finditer(pattern, restored_text))
950
- for match in matches:
951
- start, end = match.span()
952
- if not any(pos in replaced_positions for pos in range(start, end)):
953
- print(f"Found regex pattern '{pattern}' at position {start}-{end}, replacing with {replacement}")
954
- before = restored_text[:start]
955
- after = restored_text[end:]
956
- restored_text = before + replacement + after
957
- replaced_positions.update(range(start, start + len(replacement)))
958
- break
959
-
960
- # Fourth pass: Smart underscore replacement
961
- # Count underscores and keywords to make intelligent replacements
962
- underscore_groups = list(re.finditer(r'_{2,}', restored_text))
963
- remaining_keywords = [kw for kw in keyword_map.values() if kw not in restored_text]
964
-
965
- if underscore_groups and remaining_keywords:
966
- print(f"Found {len(underscore_groups)} underscore groups and {len(remaining_keywords)} unused keywords")
967
-
968
- # Sort underscore groups by length (descending) to prioritize longer ones
969
- underscore_groups.sort(key=lambda x: x.end() - x.start(), reverse=True)
970
-
971
- for i, underscore_match in enumerate(underscore_groups):
972
- if i < len(remaining_keywords):
973
- start, end = underscore_match.span()
974
- if not any(pos in replaced_positions for pos in range(start, end)):
975
- keyword = remaining_keywords[i]
976
- before = restored_text[:start]
977
- after = restored_text[end:]
978
- restored_text = before + keyword + after
979
- replaced_positions.update(range(start, start + len(keyword)))
980
- print(f"Replaced underscore group at {start}-{end} with keyword: {keyword}")
981
-
982
- # Fifth pass: Context-aware replacement
983
- # Look for patterns where keywords might make sense
984
  for placeholder, keyword in keyword_map.items():
985
- if keyword not in restored_text:
986
- # Look for sentences or phrases that seem to be missing the keyword
987
- # Common patterns where keywords might be missing
988
- context_patterns = [
989
- r'the\s+(?:is|are|was|were)\s+', # "the ___ is"
990
- r'of\s+(?:the\s+)?', # "of the ___"
991
- r'for\s+(?:the\s+)?', # "for the ___"
992
- r'in\s+(?:the\s+)?', # "in the ___"
993
- r'with\s+(?:the\s+)?', # "with the ___"
994
- r'about\s+(?:the\s+)?', # "about the ___"
995
- r'using\s+(?:the\s+)?', # "using the ___"
996
- r'through\s+(?:the\s+)?', # "through the ___"
997
- ]
998
-
999
- for pattern in context_patterns:
1000
- matches = list(re.finditer(pattern + r'([A-Z]{2,}\d+|\b\w{1,3}\b)', restored_text))
1001
- for match in matches:
1002
- suspicious_word = match.group(1)
1003
- # Check if this looks like a mangled placeholder
1004
- if re.match(r'^[A-Z]{1,3}\d+$', suspicious_word) or len(suspicious_word) <= 3:
1005
- start = match.start(1)
1006
- end = match.end(1)
1007
- if not any(pos in replaced_positions for pos in range(start, end)):
1008
- before = restored_text[:start]
1009
- after = restored_text[end:]
1010
- restored_text = before + keyword + after
1011
- replaced_positions.update(range(start, start + len(keyword)))
1012
- print(f"Context-aware replacement: replaced '{suspicious_word}' with '{keyword}'")
1013
- break
1014
-
1015
- # Final cleanup passes
1016
- # Remove any remaining placeholder artifacts
1017
- cleanup_patterns = [
1018
- (r'``+', ''), # Remove backticks
1019
- (r'__+', ' '), # Replace multiple underscores with space
1020
- (r'--+', '-'), # Normalize dashes
1021
- (r'\s{2,}', ' '), # Normalize spaces
1022
- (r'([.,!?])\s*\1+', r'\1'), # Remove duplicate punctuation
1023
- ]
1024
-
1025
- for pattern, replacement in cleanup_patterns:
1026
- restored_text = re.sub(pattern, replacement, restored_text)
1027
-
1028
- # Ensure proper spacing around keywords
1029
- for keyword in keyword_map.values():
1030
- if keyword in restored_text:
1031
- # Fix spacing issues around the keyword
1032
- restored_text = re.sub(r'(\w)(' + re.escape(keyword) + r')', r'\1 \2', restored_text)
1033
- restored_text = re.sub(r'(' + re.escape(keyword) + r')(\w)', r'\1 \2', restored_text)
1034
- # Remove duplicate spaces
1035
- restored_text = re.sub(r'\s+', ' ', restored_text)
1036
-
1037
- # Final verification
1038
- for placeholder, keyword in keyword_map.items():
1039
- if keyword not in restored_text:
1040
- print(f"WARNING: Keyword '{keyword}' still missing from final text!")
1041
-
1042
- # Log final result
1043
- print(f"Final restored text: {restored_text[:100]}...")
1044
 
1045
  return restored_text.strip()
1046
 
@@ -1235,7 +896,7 @@ class EnhancedDipperHumanizer:
1235
 
1236
  # Remove leading non-letter characters carefully
1237
  # IMPORTANT: Preserve keyword placeholders
1238
- if not re.match(r'^(__KW\d+__|KW\d+)', text):
1239
  # Only remove if it doesn't start with a placeholder
1240
  text = re.sub(r'^[^a-zA-Z_]+', '', text)
1241
 
@@ -1253,11 +914,6 @@ class EnhancedDipperHumanizer:
1253
  # Preserve keywords
1254
  text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)
1255
 
1256
- # Add debug logging
1257
- if keyword_map:
1258
- print(f"Debug: Created keyword map: {keyword_map}")
1259
- print(f"Debug: Text with placeholders: {text_with_placeholders[:100]}...")
1260
-
1261
  # Split into sentences for better control
1262
  sentences = self.split_into_sentences_advanced(text_with_placeholders)
1263
  paraphrased_sentences = []
@@ -1361,25 +1017,9 @@ class EnhancedDipperHumanizer:
1361
  # Join sentences back
1362
  result = ' '.join(paraphrased_sentences)
1363
 
1364
- # Debug before restoration
1365
- if keyword_map:
1366
- print(f"Debug: Result before restoration: {result[:100]}...")
1367
- print(f"Debug: Checking for placeholders...")
1368
- for placeholder in keyword_map.keys():
1369
- if placeholder in result:
1370
- print(f"Debug: Found placeholder {placeholder} in result")
1371
- else:
1372
- # Check for mangled versions
1373
- if '___' in result:
1374
- print(f"Debug: Found underscores ___ instead of {placeholder}")
1375
-
1376
  # Restore keywords AFTER joining all sentences
1377
  result = self.restore_keywords_robust(result, keyword_map)
1378
 
1379
- # Debug after restoration
1380
- if keyword_map:
1381
- print(f"Debug: Result after restoration: {result[:100]}...")
1382
-
1383
  # Apply natural human patterns
1384
  result = self.add_natural_human_patterns(result)
1385
 
@@ -1450,7 +1090,7 @@ class EnhancedDipperHumanizer:
1450
 
1451
  # Ensure first letter is capitalized ONLY if it's sentence start
1452
  # Don't capitalize words like "iPhone" or "eBay" or placeholders
1453
- if generated and generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0]) and not generated.startswith('__KW'):
1454
  generated = generated[0].upper() + generated[1:]
1455
 
1456
  return generated
@@ -1640,8 +1280,7 @@ class EnhancedDipperHumanizer:
1640
  # Check if it's not an acronym or proper noun that should stay lowercase
1641
  if (first_word[0].islower() and
1642
  not self.is_likely_acronym_or_proper_noun(first_word) and
1643
- not first_word.startswith('__KW') and
1644
- not first_word.startswith('_kw')):
1645
  # Only capitalize if it's a regular word
1646
  sentence = first_word[0].upper() + first_word[1:] + ' ' + ' '.join(words[1:])
1647
 
@@ -1788,100 +1427,56 @@ class EnhancedDipperHumanizer:
1788
 
1789
  return html_text
1790
 
1791
- def wrap_keywords_in_paragraphs(self, soup, keywords):
1792
- """Wrap keywords with <strong> tags inside <p> tags only"""
1793
  if not keywords:
1794
- return
1795
-
1796
- # Find all paragraph tags
1797
- for p_tag in soup.find_all('p'):
1798
- # Skip paragraphs that are inside special elements
1799
- # Check if paragraph is inside any of these elements
1800
- skip_parents = ['div.author-intro', 'div.cta-box', 'div.testimonial-card',
1801
- 'div.news-box', 'button', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
1802
- 'div.quiz-container', 'div.question-container', 'div.results']
1803
-
1804
- # Check if this paragraph should be skipped
1805
- should_skip = False
1806
- for parent in p_tag.parents:
1807
- # Check by class
1808
- if parent.name == 'div' and parent.get('class'):
1809
- classes = parent.get('class', [])
1810
- if isinstance(classes, list):
1811
- class_str = ' '.join(str(cls) for cls in classes)
1812
- else:
1813
- class_str = str(classes)
1814
-
1815
- if any(skip_class in class_str for skip_class in
1816
- ['author-intro', 'cta-box', 'testimonial-card', 'news-box',
1817
- 'quiz-container', 'question-container', 'results', 'stats-grid',
1818
- 'toc-', 'comparison-tables']):
1819
- should_skip = True
1820
- break
1821
-
1822
- # Check by tag name
1823
- if parent.name in ['button', 'a', 'blockquote', 'details', 'summary']:
1824
- should_skip = True
1825
- break
1826
-
1827
- if should_skip:
1828
- continue
1829
-
1830
- # Additional check: Skip if paragraph has specific classes
1831
- p_classes = p_tag.get('class', [])
1832
- if isinstance(p_classes, list):
1833
- p_class_str = ' '.join(str(cls) for cls in p_classes)
1834
- else:
1835
- p_class_str = str(p_classes)
1836
-
1837
- if any(skip_class in p_class_str for skip_class in ['testimonial-card', 'quiz-', 'stat-']):
1838
- continue
1839
-
1840
- # Process only if this is a regular content paragraph
1841
- # Get all text nodes in this paragraph
1842
- for text_node in p_tag.find_all(string=True):
1843
- # Skip if already inside a strong or b tag
1844
- if text_node.parent.name in ['strong', 'b', 'em', 'i', 'span', 'a']:
1845
  continue
1846
 
1847
- # Skip if the text node's immediate parent isn't the p tag
1848
- # (to avoid nested elements)
1849
- if text_node.parent != p_tag:
1850
- continue
1851
 
1852
- original_text = str(text_node)
 
 
1853
 
1854
- # Skip very short text nodes
1855
- if len(original_text.strip()) < 20:
1856
- continue
1857
 
1858
- modified_text = original_text
1859
-
1860
- # Check each keyword
1861
- for keyword in keywords:
1862
- # Use word boundaries for accurate matching
1863
- pattern = r'\b' + re.escape(keyword) + r'\b'
1864
 
1865
- # Find all matches (case-insensitive)
1866
- matches = list(re.finditer(pattern, modified_text, flags=re.IGNORECASE))
 
 
 
 
1867
 
1868
- # Replace from end to beginning to maintain positions
1869
- for match in reversed(matches):
1870
- start, end = match.span()
1871
- matched_text = match.group(0)
1872
- # Wrap with strong tag
1873
- modified_text = (modified_text[:start] +
1874
- f'<strong>{matched_text}</strong>' +
1875
- modified_text[end:])
1876
-
1877
- # If text was modified, replace the text node
1878
- if modified_text != original_text:
1879
- # Parse the modified text to create new nodes
1880
- new_soup = BeautifulSoup(modified_text, 'html.parser')
1881
- # Replace the text node with the new nodes
1882
- for new_node in reversed(new_soup.contents):
1883
- text_node.insert_after(new_node)
1884
- text_node.extract()
1885
 
1886
  def add_natural_flow_variations(self, text):
1887
  """Add more natural flow and rhythm variations for Originality AI"""
@@ -2020,26 +1615,12 @@ class EnhancedDipperHumanizer:
2020
  if len(original_text.split()) < 3:
2021
  continue
2022
 
2023
- # Debug: Check if keywords are in this text
2024
- text_has_keywords = any(keyword.lower() in original_text.lower() for keyword in all_keywords)
2025
- if text_has_keywords:
2026
- print(f"Debug: Processing text with keywords: {original_text[:50]}...")
2027
-
2028
  # First pass with Dipper (with adjusted diversity)
2029
  paraphrased_text = self.paraphrase_with_dipper(
2030
  original_text,
2031
  keywords=all_keywords
2032
  )
2033
 
2034
- # Verify no placeholders remain
2035
- if '__KW' in paraphrased_text or '___' in paraphrased_text:
2036
- print(f"Warning: Placeholder or underscores found in paraphrased text: {paraphrased_text[:100]}...")
2037
- # Try to restore again with the enhanced function
2038
- temp_map = {}
2039
- for j, keyword in enumerate(all_keywords):
2040
- temp_map[f'__KW{j:03d}__'] = keyword
2041
- paraphrased_text = self.restore_keywords_robust(paraphrased_text, temp_map)
2042
-
2043
  # Second pass with BART for longer texts (increased probability)
2044
  if self.use_bart and len(paraphrased_text.split()) > 8:
2045
  # 50% chance to use BART for more variation (reduced from 60%)
@@ -2058,12 +1639,6 @@ class EnhancedDipperHumanizer:
2058
  # Fix punctuation and formatting
2059
  paraphrased_text = self.fix_punctuation(paraphrased_text)
2060
 
2061
- # Final check for any remaining placeholders or underscores
2062
- if '___' in paraphrased_text or '__KW' in paraphrased_text:
2063
- print(f"Error: Unresolved placeholders in final text")
2064
- # Use original text if we can't resolve placeholders
2065
- paraphrased_text = original_text
2066
-
2067
  # Final quality check
2068
  if paraphrased_text and len(paraphrased_text.split()) >= 3:
2069
  element_info['element'].replace_with(NavigableString(paraphrased_text))
@@ -2077,20 +1652,8 @@ class EnhancedDipperHumanizer:
2077
  progress = (i + 1) / total_elements * 100
2078
  print(f"Progress: {progress:.1f}%")
2079
 
2080
- # Wrap keywords with <strong> tags in paragraphs
2081
- self.wrap_keywords_in_paragraphs(soup, all_keywords)
2082
-
2083
- # Post-process the entire HTML to fix bold/strong formatting
2084
  result = str(soup)
2085
- result = self.post_process_html(result)
2086
-
2087
- # Final safety check for any remaining placeholders or underscores
2088
- if '__KW' in result or re.search(r'_{3,}', result):
2089
- print("Warning: Found placeholders or multiple underscores in final HTML output")
2090
- # Attempt to clean them with keywords
2091
- for i, keyword in enumerate(all_keywords):
2092
- result = result.replace(f'__KW{i:03d}__', keyword)
2093
- result = re.sub(r'_{3,}', keyword, result, count=1)
2094
 
2095
  # Restore all script tags
2096
  for idx, script_content in enumerate(preserved_scripts):
@@ -2102,6 +1665,12 @@ class EnhancedDipperHumanizer:
2102
  placeholder = style_placeholder.format(idx)
2103
  result = result.replace(placeholder, style_content)
2104
 
 
 
 
 
 
 
2105
  # Validate and fix HTML syntax
2106
  result = self.validate_and_fix_html(result)
2107
 
@@ -2293,7 +1862,7 @@ iface = gr.Interface(
2293
  - Stream-of-consciousness elements and rhetorical questions
2294
  - Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
2295
  - Fixed placeholder system that preserves keywords
2296
- - Keywords inside <p> tags are automatically wrapped with <strong> tags
2297
  - Skips content in <strong>, <b>, and heading tags (including inside tables)
2298
  - Designed to pass the strictest AI detection systems
2299
 
 
643
  return text
644
 
645
  def preserve_keywords(self, text, keywords):
646
+ """Mark keywords to preserve them during paraphrasing - SIMPLIFIED"""
647
  if not keywords:
648
  return text, {}
649
 
 
656
 
657
  for i, keyword in enumerate(sorted_keywords):
658
  # Use unique markers that won't be confused
659
+ placeholder = f"KWPH{i:04d}" # e.g., KWPH0001
660
 
661
  # Find all occurrences of the keyword (case-insensitive)
662
  pattern = r'\b' + re.escape(keyword) + r'\b'
 
669
  start, end = match.span()
670
  modified_text = modified_text[:start] + placeholder + modified_text[end:]
671
  # Store the original case version
672
+ if placeholder not in keyword_map:
673
+ keyword_map[placeholder] = original_keyword
674
 
675
  return modified_text, keyword_map
676
 
677
  def restore_keywords_robust(self, text, keyword_map):
678
+ """Restore keywords with simple direct replacement"""
679
  if not keyword_map:
680
  return text
681
 
682
  restored_text = text
683
 
684
+ # Simple direct replacement
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
685
  for placeholder, keyword in keyword_map.items():
686
+ # Direct replacement
687
+ restored_text = restored_text.replace(placeholder, keyword)
688
+
689
+ # Also try with potential variations that might occur
690
+ restored_text = restored_text.replace(f" {placeholder} ", f" {keyword} ")
691
+ restored_text = restored_text.replace(f"{placeholder}.", f"{keyword}.")
692
+ restored_text = restored_text.replace(f"{placeholder},", f"{keyword},")
693
+ restored_text = restored_text.replace(f"{placeholder}!", f"{keyword}!")
694
+ restored_text = restored_text.replace(f"{placeholder}?", f"{keyword}?")
695
+ restored_text = restored_text.replace(f"{placeholder}:", f"{keyword}:")
696
+ restored_text = restored_text.replace(f"{placeholder};", f"{keyword};")
697
+ restored_text = restored_text.replace(f"({placeholder})", f"({keyword})")
698
+ restored_text = restored_text.replace(f'"{placeholder}"', f'"{keyword}"')
699
+ restored_text = restored_text.replace(f"'{placeholder}'", f"'{keyword}'")
700
+
701
+ # Handle case variations
702
+ restored_text = restored_text.replace(placeholder.lower(), keyword)
703
+ restored_text = restored_text.replace(placeholder.upper(), keyword)
704
+ restored_text = restored_text.replace(placeholder.capitalize(), keyword)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705
 
706
  return restored_text.strip()
707
 
 
896
 
897
  # Remove leading non-letter characters carefully
898
  # IMPORTANT: Preserve keyword placeholders
899
+ if not re.match(r'^(KWPH\d+)', text):
900
  # Only remove if it doesn't start with a placeholder
901
  text = re.sub(r'^[^a-zA-Z_]+', '', text)
902
 
 
914
  # Preserve keywords
915
  text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)
916
 
 
 
 
 
 
917
  # Split into sentences for better control
918
  sentences = self.split_into_sentences_advanced(text_with_placeholders)
919
  paraphrased_sentences = []
 
1017
  # Join sentences back
1018
  result = ' '.join(paraphrased_sentences)
1019
 
 
 
 
 
 
 
 
 
 
 
 
 
1020
  # Restore keywords AFTER joining all sentences
1021
  result = self.restore_keywords_robust(result, keyword_map)
1022
 
 
 
 
 
1023
  # Apply natural human patterns
1024
  result = self.add_natural_human_patterns(result)
1025
 
 
1090
 
1091
  # Ensure first letter is capitalized ONLY if it's sentence start
1092
  # Don't capitalize words like "iPhone" or "eBay" or placeholders
1093
+ if generated and generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0]) and not generated.startswith('KWPH'):
1094
  generated = generated[0].upper() + generated[1:]
1095
 
1096
  return generated
 
1280
  # Check if it's not an acronym or proper noun that should stay lowercase
1281
  if (first_word[0].islower() and
1282
  not self.is_likely_acronym_or_proper_noun(first_word) and
1283
+ not first_word.startswith('KWPH')):
 
1284
  # Only capitalize if it's a regular word
1285
  sentence = first_word[0].upper() + first_word[1:] + ' ' + ' '.join(words[1:])
1286
 
 
1427
 
1428
  return html_text
1429
 
1430
+ def wrap_keywords_in_bold(self, html_content, keywords):
1431
+ """Wrap all keyword occurrences with <strong> tags - FIXED VERSION"""
1432
  if not keywords:
1433
+ return html_content
1434
+
1435
+ # Parse the HTML
1436
+ soup = BeautifulSoup(html_content, 'html.parser')
1437
+
1438
+ # Process each keyword
1439
+ for keyword in keywords:
1440
+ # Find all text nodes that contain this keyword
1441
+ for element in soup.find_all(string=re.compile(re.escape(keyword), re.IGNORECASE)):
1442
+ # Skip if already inside certain tags
1443
+ parent = element.parent
1444
+ if parent and parent.name in ['script', 'style', 'strong', 'b', 'a', 'button',
1445
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1446
  continue
1447
 
1448
+ # Get the text content
1449
+ text = str(element)
 
 
1450
 
1451
+ # Replace all occurrences of the keyword with <strong> wrapped version
1452
+ # Use a regex to preserve the original case
1453
+ pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
1454
 
1455
+ # Find all matches
1456
+ matches = list(pattern.finditer(text))
 
1457
 
1458
+ if matches:
1459
+ # Build the new text with <strong> tags
1460
+ new_text = ""
1461
+ last_end = 0
 
 
1462
 
1463
+ for match in matches:
1464
+ # Add text before the match
1465
+ new_text += text[last_end:match.start()]
1466
+ # Add the wrapped keyword (preserving original case)
1467
+ new_text += f"<strong>{match.group(0)}</strong>"
1468
+ last_end = match.end()
1469
 
1470
+ # Add remaining text
1471
+ new_text += text[last_end:]
1472
+
1473
+ # Replace the text node with new HTML
1474
+ new_soup = BeautifulSoup(new_text, 'html.parser')
1475
+ for new_element in reversed(list(new_soup.contents)):
1476
+ element.insert_after(new_element)
1477
+ element.extract()
1478
+
1479
+ return str(soup)
 
 
 
 
 
 
 
1480
 
1481
  def add_natural_flow_variations(self, text):
1482
  """Add more natural flow and rhythm variations for Originality AI"""
 
1615
  if len(original_text.split()) < 3:
1616
  continue
1617
 
 
 
 
 
 
1618
  # First pass with Dipper (with adjusted diversity)
1619
  paraphrased_text = self.paraphrase_with_dipper(
1620
  original_text,
1621
  keywords=all_keywords
1622
  )
1623
 
 
 
 
 
 
 
 
 
 
1624
  # Second pass with BART for longer texts (increased probability)
1625
  if self.use_bart and len(paraphrased_text.split()) > 8:
1626
  # 50% chance to use BART for more variation (reduced from 60%)
 
1639
  # Fix punctuation and formatting
1640
  paraphrased_text = self.fix_punctuation(paraphrased_text)
1641
 
 
 
 
 
 
 
1642
  # Final quality check
1643
  if paraphrased_text and len(paraphrased_text.split()) >= 3:
1644
  element_info['element'].replace_with(NavigableString(paraphrased_text))
 
1652
  progress = (i + 1) / total_elements * 100
1653
  print(f"Progress: {progress:.1f}%")
1654
 
1655
+ # Get the processed HTML
 
 
 
1656
  result = str(soup)
 
 
 
 
 
 
 
 
 
1657
 
1658
  # Restore all script tags
1659
  for idx, script_content in enumerate(preserved_scripts):
 
1665
  placeholder = style_placeholder.format(idx)
1666
  result = result.replace(placeholder, style_content)
1667
 
1668
+ # NOW wrap keywords in bold tags after all processing is complete
1669
+ result = self.wrap_keywords_in_bold(result, all_keywords)
1670
+
1671
+ # Post-process the entire HTML to fix bold/strong formatting
1672
+ result = self.post_process_html(result)
1673
+
1674
  # Validate and fix HTML syntax
1675
  result = self.validate_and_fix_html(result)
1676
 
 
1862
  - Stream-of-consciousness elements and rhetorical questions
1863
  - Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
1864
  - Fixed placeholder system that preserves keywords
1865
+ - Keywords are automatically wrapped with <strong> tags
1866
  - Skips content in <strong>, <b>, and heading tags (including inside tables)
1867
  - Designed to pass the strictest AI detection systems
1868