Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -603,11 +603,18 @@ class EnhancedDipperHumanizer:
|
|
| 603 |
print(f"Restoring keywords in text: {restored_text[:100]}...")
|
| 604 |
print(f"Keyword map: {keyword_map}")
|
| 605 |
|
|
|
|
|
|
|
|
|
|
| 606 |
# First pass: Direct placeholder replacement
|
| 607 |
for placeholder, keyword in keyword_map.items():
|
| 608 |
if placeholder in restored_text:
|
| 609 |
print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
|
| 610 |
restored_text = restored_text.replace(placeholder, keyword)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
|
| 612 |
# Second pass: Handle any mangled placeholders
|
| 613 |
# The model might alter placeholders in various ways
|
|
@@ -619,83 +626,67 @@ class EnhancedDipperHumanizer:
|
|
| 619 |
|
| 620 |
# Various patterns the model might create
|
| 621 |
patterns = [
|
| 622 |
-
f'__KW{num}__',
|
| 623 |
-
f'__ KW{num}__',
|
| 624 |
-
f'__KW {num}__',
|
| 625 |
-
f'__ KW {num} __',
|
| 626 |
-
f'_KW{num}_',
|
| 627 |
-
f'_kw{num}_',
|
| 628 |
-
f'KW{num}',
|
| 629 |
-
f'KW {num}',
|
| 630 |
-
f'__kw{num}__',
|
| 631 |
-
f'__Kw{num}__',
|
| 632 |
-
f'__ kw{num}__',
|
| 633 |
-
f'__KW{num}_',
|
| 634 |
-
f'_KW{num}__',
|
| 635 |
-
f'kw{num}',
|
| 636 |
-
f'
|
| 637 |
-
f'
|
| 638 |
-
f'
|
| 639 |
-
f'__ __',
|
| 640 |
-
f'___ ___',
|
| 641 |
]
|
| 642 |
|
| 643 |
-
for pattern in patterns:
|
| 644 |
if pattern in restored_text:
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
print(f"Found underscore pattern '{pattern}', replacing with {replacement}")
|
| 686 |
-
restored_text = re.sub(pattern, replacement, restored_text)
|
| 687 |
-
|
| 688 |
-
# Final safety check: Look for any remaining placeholder-like patterns
|
| 689 |
-
remaining_underscores = re.findall(r'_{2,}', restored_text)
|
| 690 |
-
if remaining_underscores:
|
| 691 |
-
print(f"Warning: Found remaining underscore patterns: {remaining_underscores}")
|
| 692 |
-
# If we still have multiple underscores and we have keywords, do a simple replacement
|
| 693 |
-
# This is aggressive but necessary when model completely mangles placeholders
|
| 694 |
-
if '___' in restored_text and keyword_map:
|
| 695 |
-
# Replace the first occurrence of multiple underscores with each keyword
|
| 696 |
-
for placeholder, keyword in keyword_map.items():
|
| 697 |
-
if '___' in restored_text:
|
| 698 |
-
restored_text = restored_text.replace('___', keyword, 1)
|
| 699 |
|
| 700 |
# Log final result
|
| 701 |
print(f"Final restored text: {restored_text[:100]}...")
|
|
@@ -873,9 +864,13 @@ class EnhancedDipperHumanizer:
|
|
| 873 |
elif text.lower().startswith('rewrite:'):
|
| 874 |
text = text[8:].strip()
|
| 875 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 876 |
# Remove leading non-letter characters carefully
|
| 877 |
# IMPORTANT: Preserve keyword placeholders
|
| 878 |
-
if not re.match(r'^__KW\d+__', text):
|
| 879 |
# Only remove if it doesn't start with a placeholder
|
| 880 |
text = re.sub(r'^[^a-zA-Z_]+', '', text)
|
| 881 |
|
|
|
|
| 603 |
print(f"Restoring keywords in text: {restored_text[:100]}...")
|
| 604 |
print(f"Keyword map: {keyword_map}")
|
| 605 |
|
| 606 |
+
# Track which positions have been replaced to avoid double replacement
|
| 607 |
+
replaced_positions = set()
|
| 608 |
+
|
| 609 |
# First pass: Direct placeholder replacement
|
| 610 |
for placeholder, keyword in keyword_map.items():
|
| 611 |
if placeholder in restored_text:
|
| 612 |
print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
|
| 613 |
restored_text = restored_text.replace(placeholder, keyword)
|
| 614 |
+
# Mark every occurrence of the keyword in the text as replaced (not only the spans just substituted)
|
| 615 |
+
import re
|
| 616 |
+
for match in re.finditer(re.escape(keyword), restored_text):
|
| 617 |
+
replaced_positions.update(range(match.start(), match.end()))
|
| 618 |
|
| 619 |
# Second pass: Handle any mangled placeholders
|
| 620 |
# The model might alter placeholders in various ways
|
|
|
|
| 626 |
|
| 627 |
# Various patterns the model might create
|
| 628 |
patterns = [
|
| 629 |
+
(f'__KW{num}__', keyword),
|
| 630 |
+
(f'__ KW{num}__', keyword),
|
| 631 |
+
(f'__KW {num}__', keyword),
|
| 632 |
+
(f'__ KW {num} __', keyword),
|
| 633 |
+
(f'_KW{num}_', keyword),
|
| 634 |
+
(f'_kw{num}_', keyword),
|
| 635 |
+
(f'KW{num}', keyword),
|
| 636 |
+
(f'KW {num}', keyword),
|
| 637 |
+
(f'__kw{num}__', keyword),
|
| 638 |
+
(f'__Kw{num}__', keyword),
|
| 639 |
+
(f'__ kw{num}__', keyword),
|
| 640 |
+
(f'__KW{num}_', keyword),
|
| 641 |
+
(f'_KW{num}__', keyword),
|
| 642 |
+
(f'kw{num}', keyword),
|
| 643 |
+
(f'``KW{num}__', keyword), # Handle backtick corruption
|
| 644 |
+
(f'``KKW{num}', keyword), # Handle double K corruption
|
| 645 |
+
(f'KW{num}', keyword), # Simple pattern
|
|
|
|
|
|
|
| 646 |
]
|
| 647 |
|
| 648 |
+
for pattern, replacement in patterns:
|
| 649 |
if pattern in restored_text:
|
| 650 |
+
# Check if this position has already been replaced
|
| 651 |
+
start_pos = restored_text.find(pattern)
|
| 652 |
+
if start_pos != -1 and not any(pos in replaced_positions for pos in range(start_pos, start_pos + len(pattern))):
|
| 653 |
+
print(f"Found pattern '{pattern}', replacing with {replacement}")
|
| 654 |
+
restored_text = restored_text.replace(pattern, replacement, 1) # Replace only first occurrence
|
| 655 |
+
# Mark new positions as replaced
|
| 656 |
+
for match in re.finditer(re.escape(replacement), restored_text):
|
| 657 |
+
replaced_positions.update(range(match.start(), match.end()))
|
| 658 |
+
break # Move to next placeholder after successful replacement
|
| 659 |
+
|
| 660 |
+
# Third pass: Clean up any backticks or quotes that shouldn't be there
|
| 661 |
+
# Remove double backticks
|
| 662 |
+
restored_text = re.sub(r'``+', '', restored_text)
|
| 663 |
+
# Collapse doubled quote characters ('' or "") into a single double quote
|
| 664 |
+
restored_text = re.sub(r"''", '"', restored_text)
|
| 665 |
+
restored_text = re.sub(r'""', '"', restored_text)
|
| 666 |
+
|
| 667 |
+
# Fourth pass: Look for remaining underscore patterns
|
| 668 |
+
# But be more careful about replacement
|
| 669 |
+
if '___' in restored_text and keyword_map:
|
| 670 |
+
# Find all occurrences of multiple underscores
|
| 671 |
+
underscore_matches = list(re.finditer(r'_{3,}', restored_text))
|
| 672 |
+
keyword_values = list(keyword_map.values())
|
| 673 |
+
|
| 674 |
+
# Replace underscores with keywords, but only if not already replaced
|
| 675 |
+
for i, match in enumerate(underscore_matches):
|
| 676 |
+
if i < len(keyword_values):
|
| 677 |
+
start, end = match.span()
|
| 678 |
+
if not any(pos in replaced_positions for pos in range(start, end)):
|
| 679 |
+
before = restored_text[:start]
|
| 680 |
+
after = restored_text[end:]
|
| 681 |
+
restored_text = before + keyword_values[i] + after
|
| 682 |
+
# Update replaced positions
|
| 683 |
+
replaced_positions.update(range(start, start + len(keyword_values[i])))
|
| 684 |
+
|
| 685 |
+
# Final check: Warn about any remaining KW patterns that weren't caught
|
| 686 |
+
# Nothing is removed here; standalone KW + three-digit tokens are only logged
|
| 687 |
+
remaining_kw_patterns = re.findall(r'\bKW\d{3}\b', restored_text)
|
| 688 |
+
if remaining_kw_patterns:
|
| 689 |
+
print(f"Warning: Found remaining KW patterns: {remaining_kw_patterns}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 690 |
|
| 691 |
# Log final result
|
| 692 |
print(f"Final restored text: {restored_text[:100]}...")
|
|
|
|
| 864 |
elif text.lower().startswith('rewrite:'):
|
| 865 |
text = text[8:].strip()
|
| 866 |
|
| 867 |
+
# Clean up backticks and doubled single quotes that sometimes appear
|
| 868 |
+
text = re.sub(r'``+', '', text)
|
| 869 |
+
text = re.sub(r"''", '"', text)
|
| 870 |
+
|
| 871 |
# Remove leading non-letter characters carefully
|
| 872 |
# IMPORTANT: Preserve keyword placeholders
|
| 873 |
+
if not re.match(r'^(__KW\d+__|KW\d+)', text):
|
| 874 |
# Only remove if it doesn't start with a placeholder
|
| 875 |
text = re.sub(r'^[^a-zA-Z_]+', '', text)
|
| 876 |
|