Spaces:

EdysorEdutech
/

human_final

Paused

App Files Files Community

EdysorEdutech committed on Jul 25

Commit

5e5997d

verified ·

1 Parent(s): 17c0697

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -75

app.py CHANGED Viewed

@@ -603,11 +603,18 @@ class EnhancedDipperHumanizer:
         print(f"Restoring keywords in text: {restored_text[:100]}...")
         print(f"Keyword map: {keyword_map}")
         # First pass: Direct placeholder replacement
         for placeholder, keyword in keyword_map.items():
             if placeholder in restored_text:
                 print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
                 restored_text = restored_text.replace(placeholder, keyword)
         # Second pass: Handle any mangled placeholders
         # The model might alter placeholders in various ways
@@ -619,83 +626,67 @@ class EnhancedDipperHumanizer:
                 # Various patterns the model might create
                 patterns = [
-                    f'__KW{num}__',
-                    f'__ KW{num}__',
-                    f'__KW {num}__',
-                    f'__ KW {num} __',
-                    f'_KW{num}_',
-                    f'_kw{num}_',  # lowercase with single underscore
-                    f'KW{num}',
-                    f'KW {num}',
-                    f'__kw{num}__',  # lowercase variant
-                    f'__Kw{num}__',  # mixed case
-                    f'__ kw{num}__',
-                    f'__KW{num}_',   # missing underscore
-                    f'_KW{num}__',   # missing underscore
-                    f'kw{num}',      # just lowercase
-                    f'___',          # Sometimes model reduces to just underscores
-                    f'____',         # Various underscore patterns
-                    f'_____',
-                    f'__ __',
-                    f'___ ___',
                 ]
-                for pattern in patterns:
                     if pattern in restored_text:
-                        print(f"Found pattern '{pattern}', replacing with {keyword}")
-                        restored_text = restored_text.replace(pattern, keyword)
-        # Third pass: Use regex to catch any remaining variations
-        # This catches cases where the model might have added characters
-        for placeholder, keyword in keyword_map.items():
-            match = re.search(r'__KW(\d+)__', placeholder)
-            if match:
-                num = match.group(1)
-                # Regex to match various mangled versions including single underscore
-                regex_patterns = [
-                    rf'_+\s*[Kk][Ww]\s*{num}\s*_*',  # Any underscores, case insensitive
-                    rf'[Kk][Ww]\s*{num}(?!\d)',       # KW followed by the number
-                    rf'__?\s*[Kk][Ww]\s*{num}\s*__?', # Optional underscores
-                    rf'_[Kk][Ww]{num}_',              # Single underscore version
-                    rf'_+\s*{num}\s*_*',              # Just the number with underscores
-                    rf'__+',                          # Multiple underscores (fallback)
-                ]
-                for pattern in regex_patterns:
-                    matches = list(re.finditer(pattern, restored_text, flags=re.IGNORECASE))
-                    if matches:
-                        print(f"Found regex pattern '{pattern}' {len(matches)} times")
-                        # Replace from end to beginning to maintain positions
-                        for match in reversed(matches):
-                            restored_text = restored_text[:match.start()] + keyword + restored_text[match.end():]
-        # Fourth pass: Look for common patterns where model mangles placeholders
-        # Sometimes the model turns __KW002__ into things like "___ University" or "___ College__"
-        underscore_patterns = [
-            (r'___+\s*[Uu]niversity', keyword + ' University') if 'universit' in keyword.lower() else None,
-            (r'___+\s*[Cc]ollege__?', keyword + ' College') if 'college' in keyword.lower() else None,
-            (r'___+\s*[Ss]chool', keyword + ' School') if 'school' in keyword.lower() else None,
-            (r'___+', keyword),  # Generic underscore replacement
-        ]
-        for pattern_tuple in underscore_patterns:
-            if pattern_tuple:
-                pattern, replacement = pattern_tuple
-                if re.search(pattern, restored_text):
-                    print(f"Found underscore pattern '{pattern}', replacing with {replacement}")
-                    restored_text = re.sub(pattern, replacement, restored_text)
-        # Final safety check: Look for any remaining placeholder-like patterns
-        remaining_underscores = re.findall(r'_{2,}', restored_text)
-        if remaining_underscores:
-            print(f"Warning: Found remaining underscore patterns: {remaining_underscores}")
-            # If we still have multiple underscores and we have keywords, do a simple replacement
-            # This is aggressive but necessary when model completely mangles placeholders
-            if '___' in restored_text and keyword_map:
-                # Replace the first occurrence of multiple underscores with each keyword
-                for placeholder, keyword in keyword_map.items():
-                    if '___' in restored_text:
-                        restored_text = restored_text.replace('___', keyword, 1)
         # Log final result
         print(f"Final restored text: {restored_text[:100]}...")
@@ -873,9 +864,13 @@ class EnhancedDipperHumanizer:
         elif text.lower().startswith('rewrite:'):
             text = text[8:].strip()
         # Remove leading non-letter characters carefully
         # IMPORTANT: Preserve keyword placeholders
-        if not re.match(r'^__KW\d+__', text):
             # Only remove if it doesn't start with a placeholder
             text = re.sub(r'^[^a-zA-Z_]+', '', text)

         print(f"Restoring keywords in text: {restored_text[:100]}...")
         print(f"Keyword map: {keyword_map}")
+        # Track which positions have been replaced to avoid double replacement
+        replaced_positions = set()
         # First pass: Direct placeholder replacement
         for placeholder, keyword in keyword_map.items():
             if placeholder in restored_text:
                 print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
                 restored_text = restored_text.replace(placeholder, keyword)
+                # Mark positions as replaced
+                import re
+                for match in re.finditer(re.escape(keyword), restored_text):
+                    replaced_positions.update(range(match.start(), match.end()))
         # Second pass: Handle any mangled placeholders
         # The model might alter placeholders in various ways
                 # Various patterns the model might create
                 patterns = [
+                    (f'__KW{num}__', keyword),
+                    (f'__ KW{num}__', keyword),
+                    (f'__KW {num}__', keyword),
+                    (f'__ KW {num} __', keyword),
+                    (f'_KW{num}_', keyword),
+                    (f'_kw{num}_', keyword),
+                    (f'KW{num}', keyword),
+                    (f'KW {num}', keyword),
+                    (f'__kw{num}__', keyword),
+                    (f'__Kw{num}__', keyword),
+                    (f'__ kw{num}__', keyword),
+                    (f'__KW{num}_', keyword),
+                    (f'_KW{num}__', keyword),
+                    (f'kw{num}', keyword),
+                    (f'``KW{num}__', keyword),  # Handle backtick corruption
+                    (f'``KKW{num}', keyword),    # Handle double K corruption
+                    (f'KW{num}', keyword),       # Simple pattern
                 ]
+                for pattern, replacement in patterns:
                     if pattern in restored_text:
+                        # Check if this position has already been replaced
+                        start_pos = restored_text.find(pattern)
+                        if start_pos != -1 and not any(pos in replaced_positions for pos in range(start_pos, start_pos + len(pattern))):
+                            print(f"Found pattern '{pattern}', replacing with {replacement}")
+                            restored_text = restored_text.replace(pattern, replacement, 1)  # Replace only first occurrence
+                            # Mark new positions as replaced
+                            for match in re.finditer(re.escape(replacement), restored_text):
+                                replaced_positions.update(range(match.start(), match.end()))
+                            break  # Move to next placeholder after successful replacement
+        # Third pass: Clean up any backticks or quotes that shouldn't be there
+        # Remove double backticks
+        restored_text = re.sub(r'``+', '', restored_text)
+        # Fix double quotes
+        restored_text = re.sub(r"''", '"', restored_text)
+        restored_text = re.sub(r'""', '"', restored_text)
+        # Fourth pass: Look for remaining underscore patterns
+        # But be more careful about replacement
+        if '___' in restored_text and keyword_map:
+            # Find all occurrences of multiple underscores
+            underscore_matches = list(re.finditer(r'_{3,}', restored_text))
+            keyword_values = list(keyword_map.values())
+            # Replace underscores with keywords, but only if not already replaced
+            for i, match in enumerate(underscore_matches):
+                if i < len(keyword_values):
+                    start, end = match.span()
+                    if not any(pos in replaced_positions for pos in range(start, end)):
+                        before = restored_text[:start]
+                        after = restored_text[end:]
+                        restored_text = before + keyword_values[i] + after
+                        # Update replaced positions
+                        replaced_positions.update(range(start, start + len(keyword_values[i])))
+        # Final cleanup: Remove any remaining KW patterns that weren't caught
+        # But only if they're not part of an already replaced keyword
+        remaining_kw_patterns = re.findall(r'\bKW\d{3}\b', restored_text)
+        if remaining_kw_patterns:
+            print(f"Warning: Found remaining KW patterns: {remaining_kw_patterns}")
         # Log final result
         print(f"Final restored text: {restored_text[:100]}...")
         elif text.lower().startswith('rewrite:'):
             text = text[8:].strip()
+        # Clean up backticks that sometimes appear
+        text = re.sub(r'``+', '', text)
+        text = re.sub(r"''", '"', text)
         # Remove leading non-letter characters carefully
         # IMPORTANT: Preserve keyword placeholders
+        if not re.match(r'^(__KW\d+__|KW\d+)', text):
             # Only remove if it doesn't start with a placeholder
             text = re.sub(r'^[^a-zA-Z_]+', '', text)