EdysorEdutech committed on
Commit
5e5997d
·
verified ·
1 Parent(s): 17c0697

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -75
app.py CHANGED
@@ -603,11 +603,18 @@ class EnhancedDipperHumanizer:
603
  print(f"Restoring keywords in text: {restored_text[:100]}...")
604
  print(f"Keyword map: {keyword_map}")
605
 
 
 
 
606
  # First pass: Direct placeholder replacement
607
  for placeholder, keyword in keyword_map.items():
608
  if placeholder in restored_text:
609
  print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
610
  restored_text = restored_text.replace(placeholder, keyword)
 
 
 
 
611
 
612
  # Second pass: Handle any mangled placeholders
613
  # The model might alter placeholders in various ways
@@ -619,83 +626,67 @@ class EnhancedDipperHumanizer:
619
 
620
  # Various patterns the model might create
621
  patterns = [
622
- f'__KW{num}__',
623
- f'__ KW{num}__',
624
- f'__KW {num}__',
625
- f'__ KW {num} __',
626
- f'_KW{num}_',
627
- f'_kw{num}_', # lowercase with single underscore
628
- f'KW{num}',
629
- f'KW {num}',
630
- f'__kw{num}__', # lowercase variant
631
- f'__Kw{num}__', # mixed case
632
- f'__ kw{num}__',
633
- f'__KW{num}_', # missing underscore
634
- f'_KW{num}__', # missing underscore
635
- f'kw{num}', # just lowercase
636
- f'___', # Sometimes model reduces to just underscores
637
- f'____', # Various underscore patterns
638
- f'_____',
639
- f'__ __',
640
- f'___ ___',
641
  ]
642
 
643
- for pattern in patterns:
644
  if pattern in restored_text:
645
- print(f"Found pattern '{pattern}', replacing with {keyword}")
646
- restored_text = restored_text.replace(pattern, keyword)
647
-
648
- # Third pass: Use regex to catch any remaining variations
649
- # This catches cases where the model might have added characters
650
- for placeholder, keyword in keyword_map.items():
651
- match = re.search(r'__KW(\d+)__', placeholder)
652
- if match:
653
- num = match.group(1)
654
- # Regex to match various mangled versions including single underscore
655
- regex_patterns = [
656
- rf'_+\s*[Kk][Ww]\s*{num}\s*_*', # Any underscores, case insensitive
657
- rf'[Kk][Ww]\s*{num}(?!\d)', # KW followed by the number
658
- rf'__?\s*[Kk][Ww]\s*{num}\s*__?', # Optional underscores
659
- rf'_[Kk][Ww]{num}_', # Single underscore version
660
- rf'_+\s*{num}\s*_*', # Just the number with underscores
661
- rf'__+', # Multiple underscores (fallback)
662
- ]
663
-
664
- for pattern in regex_patterns:
665
- matches = list(re.finditer(pattern, restored_text, flags=re.IGNORECASE))
666
- if matches:
667
- print(f"Found regex pattern '{pattern}' {len(matches)} times")
668
- # Replace from end to beginning to maintain positions
669
- for match in reversed(matches):
670
- restored_text = restored_text[:match.start()] + keyword + restored_text[match.end():]
671
-
672
- # Fourth pass: Look for common patterns where model mangles placeholders
673
- # Sometimes the model turns __KW002__ into things like "___ University" or "___ College__"
674
- underscore_patterns = [
675
- (r'___+\s*[Uu]niversity', keyword + ' University') if 'universit' in keyword.lower() else None,
676
- (r'___+\s*[Cc]ollege__?', keyword + ' College') if 'college' in keyword.lower() else None,
677
- (r'___+\s*[Ss]chool', keyword + ' School') if 'school' in keyword.lower() else None,
678
- (r'___+', keyword), # Generic underscore replacement
679
- ]
680
-
681
- for pattern_tuple in underscore_patterns:
682
- if pattern_tuple:
683
- pattern, replacement = pattern_tuple
684
- if re.search(pattern, restored_text):
685
- print(f"Found underscore pattern '{pattern}', replacing with {replacement}")
686
- restored_text = re.sub(pattern, replacement, restored_text)
687
-
688
- # Final safety check: Look for any remaining placeholder-like patterns
689
- remaining_underscores = re.findall(r'_{2,}', restored_text)
690
- if remaining_underscores:
691
- print(f"Warning: Found remaining underscore patterns: {remaining_underscores}")
692
- # If we still have multiple underscores and we have keywords, do a simple replacement
693
- # This is aggressive but necessary when model completely mangles placeholders
694
- if '___' in restored_text and keyword_map:
695
- # Replace the first occurrence of multiple underscores with each keyword
696
- for placeholder, keyword in keyword_map.items():
697
- if '___' in restored_text:
698
- restored_text = restored_text.replace('___', keyword, 1)
699
 
700
  # Log final result
701
  print(f"Final restored text: {restored_text[:100]}...")
@@ -873,9 +864,13 @@ class EnhancedDipperHumanizer:
873
  elif text.lower().startswith('rewrite:'):
874
  text = text[8:].strip()
875
 
 
 
 
 
876
  # Remove leading non-letter characters carefully
877
  # IMPORTANT: Preserve keyword placeholders
878
- if not re.match(r'^__KW\d+__', text):
879
  # Only remove if it doesn't start with a placeholder
880
  text = re.sub(r'^[^a-zA-Z_]+', '', text)
881
 
 
603
  print(f"Restoring keywords in text: {restored_text[:100]}...")
604
  print(f"Keyword map: {keyword_map}")
605
 
606
+ # Track which positions have been replaced to avoid double replacement
607
+ replaced_positions = set()
608
+
609
  # First pass: Direct placeholder replacement
610
  for placeholder, keyword in keyword_map.items():
611
  if placeholder in restored_text:
612
  print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
613
  restored_text = restored_text.replace(placeholder, keyword)
614
+ # Mark positions as replaced
615
+ import re
616
+ for match in re.finditer(re.escape(keyword), restored_text):
617
+ replaced_positions.update(range(match.start(), match.end()))
618
 
619
  # Second pass: Handle any mangled placeholders
620
  # The model might alter placeholders in various ways
 
626
 
627
  # Various patterns the model might create
628
  patterns = [
629
+ (f'__KW{num}__', keyword),
630
+ (f'__ KW{num}__', keyword),
631
+ (f'__KW {num}__', keyword),
632
+ (f'__ KW {num} __', keyword),
633
+ (f'_KW{num}_', keyword),
634
+ (f'_kw{num}_', keyword),
635
+ (f'KW{num}', keyword),
636
+ (f'KW {num}', keyword),
637
+ (f'__kw{num}__', keyword),
638
+ (f'__Kw{num}__', keyword),
639
+ (f'__ kw{num}__', keyword),
640
+ (f'__KW{num}_', keyword),
641
+ (f'_KW{num}__', keyword),
642
+ (f'kw{num}', keyword),
643
+ (f'``KW{num}__', keyword), # Handle backtick corruption
644
+ (f'``KKW{num}', keyword), # Handle double K corruption
645
+ (f'KW{num}', keyword), # Simple pattern
 
 
646
  ]
647
 
648
+ for pattern, replacement in patterns:
649
  if pattern in restored_text:
650
+ # Check if this position has already been replaced
651
+ start_pos = restored_text.find(pattern)
652
+ if start_pos != -1 and not any(pos in replaced_positions for pos in range(start_pos, start_pos + len(pattern))):
653
+ print(f"Found pattern '{pattern}', replacing with {replacement}")
654
+ restored_text = restored_text.replace(pattern, replacement, 1) # Replace only first occurrence
655
+ # Mark new positions as replaced
656
+ for match in re.finditer(re.escape(replacement), restored_text):
657
+ replaced_positions.update(range(match.start(), match.end()))
658
+ break # Move to next placeholder after successful replacement
659
+
660
+ # Third pass: Clean up any backticks or quotes that shouldn't be there
661
+ # Remove double backticks
662
+ restored_text = re.sub(r'``+', '', restored_text)
663
+ # Fix double quotes
664
+ restored_text = re.sub(r"''", '"', restored_text)
665
+ restored_text = re.sub(r'""', '"', restored_text)
666
+
667
+ # Fourth pass: Look for remaining underscore patterns
668
+ # But be more careful about replacement
669
+ if '___' in restored_text and keyword_map:
670
+ # Find all occurrences of multiple underscores
671
+ underscore_matches = list(re.finditer(r'_{3,}', restored_text))
672
+ keyword_values = list(keyword_map.values())
673
+
674
+ # Replace underscores with keywords, but only if not already replaced
675
+ for i, match in enumerate(underscore_matches):
676
+ if i < len(keyword_values):
677
+ start, end = match.span()
678
+ if not any(pos in replaced_positions for pos in range(start, end)):
679
+ before = restored_text[:start]
680
+ after = restored_text[end:]
681
+ restored_text = before + keyword_values[i] + after
682
+ # Update replaced positions
683
+ replaced_positions.update(range(start, start + len(keyword_values[i])))
684
+
685
+ # Final cleanup: Remove any remaining KW patterns that weren't caught
686
+ # But only if they're not part of an already replaced keyword
687
+ remaining_kw_patterns = re.findall(r'\bKW\d{3}\b', restored_text)
688
+ if remaining_kw_patterns:
689
+ print(f"Warning: Found remaining KW patterns: {remaining_kw_patterns}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
690
 
691
  # Log final result
692
  print(f"Final restored text: {restored_text[:100]}...")
 
864
  elif text.lower().startswith('rewrite:'):
865
  text = text[8:].strip()
866
 
867
+ # Clean up backticks that sometimes appear
868
+ text = re.sub(r'``+', '', text)
869
+ text = re.sub(r"''", '"', text)
870
+
871
  # Remove leading non-letter characters carefully
872
  # IMPORTANT: Preserve keyword placeholders
873
+ if not re.match(r'^(__KW\d+__|KW\d+)', text):
874
  # Only remove if it doesn't start with a placeholder
875
  text = re.sub(r'^[^a-zA-Z_]+', '', text)
876