Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -663,150 +663,122 @@ class EnhancedDipperHumanizer:
|
|
| 663 |
return text
|
| 664 |
|
| 665 |
def should_skip_element(self, element, text):
|
| 666 |
-
|
| 667 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
return True
|
| 669 |
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 674 |
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
for attr_name, attr_value in parent.attrs.items():
|
| 678 |
-
if attr_name.startswith('on') and 'selectOption' in str(attr_value):
|
| 679 |
-
return True
|
| 680 |
|
| 681 |
-
#
|
| 682 |
-
|
| 683 |
-
|
|
|
|
|
|
|
| 684 |
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
|
|
|
|
|
|
|
|
|
| 696 |
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
is_in_table = any(p.name == 'table' for p in parent.parents)
|
| 702 |
-
if is_in_table:
|
| 703 |
-
# If we're in a table, skip any text that's inside formatting tags
|
| 704 |
-
if parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'i']:
|
| 705 |
-
return True
|
| 706 |
-
# Also check if parent's parent is a formatting tag
|
| 707 |
-
if parent.parent and parent.parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
| 708 |
-
return True
|
| 709 |
-
|
| 710 |
-
# Skip table of contents
|
| 711 |
-
if parent:
|
| 712 |
-
parent_text = str(parent).lower()
|
| 713 |
-
if any(toc in parent_text for toc in ['table of contents', 'toc-', 'contents']):
|
| 714 |
-
return True
|
| 715 |
-
|
| 716 |
-
# Skip CTAs and buttons
|
| 717 |
-
if parent and parent.name in ['button', 'a']:
|
| 718 |
-
return True
|
| 719 |
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
ancestors_to_check = []
|
| 729 |
-
current = parent
|
| 730 |
-
for _ in range(3): # Check up to 3 levels up
|
| 731 |
-
if current:
|
| 732 |
-
ancestors_to_check.append(current)
|
| 733 |
-
current = current.parent
|
| 734 |
-
|
| 735 |
-
# Check if any ancestor has testimonial-card class
|
| 736 |
-
for ancestor in ancestors_to_check:
|
| 737 |
-
if ancestor and ancestor.get('class'):
|
| 738 |
-
classes = ancestor.get('class', [])
|
| 739 |
-
if isinstance(classes, list):
|
| 740 |
-
if any('testimonial-card' in str(cls) for cls in classes):
|
| 741 |
-
return True
|
| 742 |
-
elif isinstance(classes, str) and 'testimonial-card' in classes:
|
| 743 |
-
return True
|
| 744 |
-
|
| 745 |
-
# Skip if IMMEDIATE parent or element itself has skip-worthy classes/IDs
|
| 746 |
-
skip_indicators = [
|
| 747 |
-
'button', 'btn', 'heading', 'title', 'caption',
|
| 748 |
-
'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
|
| 749 |
-
'warning', 'info', 'success', 'error', 'code', 'pre',
|
| 750 |
-
'stats-grid', 'testimonial-card',
|
| 751 |
-
'cta-box', 'quiz-container', 'contact-form',
|
| 752 |
-
'faq-question', 'sidebar', 'widget', 'banner',
|
| 753 |
-
'author-intro', 'testimonial', 'review', 'feedback',
|
| 754 |
-
'floating-', 'stat-', 'progress-', 'option', 'results',
|
| 755 |
-
'question-container', 'quiz-',
|
| 756 |
-
'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
|
| 757 |
-
]
|
| 758 |
|
| 759 |
-
#
|
| 760 |
-
|
| 761 |
-
'results', 'progressFill', 'currentQuestion', 'totalQuestions']
|
| 762 |
|
| 763 |
-
#
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
return True
|
| 780 |
-
|
| 781 |
-
# Check element's ID
|
| 782 |
-
elem_id = elem.get('id', '')
|
| 783 |
-
if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
|
| 784 |
-
return True
|
| 785 |
-
if elem_id in quiz_indicators:
|
| 786 |
-
return True
|
| 787 |
-
|
| 788 |
-
# Skip short phrases that might be UI elements
|
| 789 |
-
word_count = len(text.split())
|
| 790 |
-
if word_count <= 5:
|
| 791 |
-
ui_patterns = [
|
| 792 |
-
'click', 'download', 'learn more', 'read more', 'sign up',
|
| 793 |
-
'get started', 'try now', 'buy now', 'next', 'previous',
|
| 794 |
-
'back', 'continue', 'submit', 'cancel', 'get now', 'book your',
|
| 795 |
-
'check out:', 'see also:', 'related:', 'question', 'of'
|
| 796 |
-
]
|
| 797 |
-
if any(pattern in text.lower() for pattern in ui_patterns):
|
| 798 |
-
return True
|
| 799 |
-
|
| 800 |
-
# Skip very short content in styled containers
|
| 801 |
-
if parent and parent.name in ['div', 'section', 'aside', 'blockquote']:
|
| 802 |
-
style = parent.get('style', '')
|
| 803 |
-
if 'border' in style or 'background' in style:
|
| 804 |
-
if word_count <= 20:
|
| 805 |
-
# But don't skip if it's inside a paragraph
|
| 806 |
-
if not any(p.name == 'p' for p in parent.parents):
|
| 807 |
-
return True
|
| 808 |
-
|
| 809 |
-
return False
|
| 810 |
|
| 811 |
def is_likely_acronym_or_proper_noun(self, word):
|
| 812 |
"""Check if a word is likely an acronym or part of a proper noun"""
|
|
@@ -1350,25 +1322,12 @@ class EnhancedDipperHumanizer:
|
|
| 1350 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 1351 |
text_elements = []
|
| 1352 |
|
| 1353 |
-
# Get all elements with onclick, onchange, etc.
|
| 1354 |
-
elements_with_handlers = soup.find_all(attrs=lambda x: x and any(k.startswith('on') for k in x.keys()))
|
| 1355 |
-
|
| 1356 |
-
# Mark these elements to skip their text content
|
| 1357 |
-
skip_elements = set()
|
| 1358 |
-
for elem in elements_with_handlers:
|
| 1359 |
-
skip_elements.add(elem)
|
| 1360 |
-
skip_elements.update(elem.descendants)
|
| 1361 |
-
|
| 1362 |
# Get all text nodes using string instead of text (fixing deprecation)
|
| 1363 |
for element in soup.find_all(string=True):
|
| 1364 |
# Skip script, style, and noscript content completely
|
| 1365 |
if element.parent.name in ['script', 'style', 'noscript']:
|
| 1366 |
continue
|
| 1367 |
|
| 1368 |
-
# Skip if element or any parent has event handlers
|
| 1369 |
-
if any(parent in skip_elements for parent in element.parents):
|
| 1370 |
-
continue
|
| 1371 |
-
|
| 1372 |
text = element.strip()
|
| 1373 |
if text and not self.should_skip_element(element, text):
|
| 1374 |
text_elements.append({
|
|
@@ -1379,21 +1338,42 @@ class EnhancedDipperHumanizer:
|
|
| 1379 |
return soup, text_elements
|
| 1380 |
|
| 1381 |
def validate_and_fix_html(self, html_text):
|
| 1382 |
-
|
| 1383 |
-
|
| 1384 |
-
|
| 1385 |
-
|
| 1386 |
-
|
| 1387 |
-
|
| 1388 |
-
|
| 1389 |
-
|
| 1390 |
-
|
| 1391 |
-
|
| 1392 |
-
|
| 1393 |
-
|
| 1394 |
-
|
| 1395 |
-
|
| 1396 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1397 |
|
| 1398 |
def add_natural_flow_variations(self, text):
|
| 1399 |
"""Add more natural flow and rhythm variations for Originality AI"""
|
|
@@ -1468,23 +1448,23 @@ class EnhancedDipperHumanizer:
|
|
| 1468 |
preserved_scripts = []
|
| 1469 |
preserved_styles = []
|
| 1470 |
|
| 1471 |
-
#
|
| 1472 |
-
|
| 1473 |
|
| 1474 |
-
#
|
| 1475 |
-
for idx, script in enumerate(
|
| 1476 |
-
placeholder =
|
| 1477 |
preserved_scripts.append(str(script))
|
| 1478 |
script.replace_with(placeholder)
|
| 1479 |
|
| 1480 |
# Preserve all style tags
|
| 1481 |
-
for idx, style in enumerate(
|
| 1482 |
-
placeholder =
|
| 1483 |
preserved_styles.append(str(style))
|
| 1484 |
style.replace_with(placeholder)
|
| 1485 |
|
| 1486 |
# Get the modified HTML
|
| 1487 |
-
html_content = str(
|
| 1488 |
|
| 1489 |
try:
|
| 1490 |
# Extract text elements
|
|
@@ -1548,12 +1528,12 @@ class EnhancedDipperHumanizer:
|
|
| 1548 |
# Restore all script tags
|
| 1549 |
for idx, script_content in enumerate(preserved_scripts):
|
| 1550 |
placeholder = script_placeholder.format(idx)
|
| 1551 |
-
result = result.replace(
|
| 1552 |
|
| 1553 |
# Restore all style tags
|
| 1554 |
for idx, style_content in enumerate(preserved_styles):
|
| 1555 |
placeholder = style_placeholder.format(idx)
|
| 1556 |
-
result = result.replace(
|
| 1557 |
|
| 1558 |
# Post-process the entire HTML to fix bold/strong formatting
|
| 1559 |
result = self.post_process_html(result)
|
|
|
|
| 663 |
return text
|
| 664 |
|
| 665 |
def should_skip_element(self, element, text):
|
| 666 |
+
"""Determine if an element should be skipped from paraphrasing"""
|
| 667 |
+
if not text or len(text.strip()) < 3:
|
| 668 |
+
return True
|
| 669 |
+
|
| 670 |
+
# Skip JavaScript code inside script tags - CRITICAL FIX
|
| 671 |
+
parent = element.parent
|
| 672 |
+
if parent and parent.name in ['script', 'style', 'noscript']:
|
| 673 |
+
return True
|
| 674 |
+
|
| 675 |
+
# Also check if we're inside a script tag at any level
|
| 676 |
+
for ancestor in element.parents:
|
| 677 |
+
if ancestor.name in ['script', 'style', 'noscript']:
|
| 678 |
return True
|
| 679 |
|
| 680 |
+
# Rest of your existing skip logic...
|
| 681 |
+
return False
|
| 682 |
+
|
| 683 |
+
def extract_text_from_html(self, html_content):
|
| 684 |
+
"""Extract text elements from HTML with skip logic"""
|
| 685 |
+
soup = BeautifulSoup(html_content, 'html.parser')
|
| 686 |
+
text_elements = []
|
| 687 |
+
|
| 688 |
+
# CRITICAL: Preserve all script tags completely
|
| 689 |
+
script_tags = soup.find_all('script')
|
| 690 |
+
script_placeholders = {}
|
| 691 |
+
|
| 692 |
+
for i, script in enumerate(script_tags):
|
| 693 |
+
placeholder = f"###SCRIPT_CONTENT_{i}###"
|
| 694 |
+
script_placeholders[placeholder] = str(script)
|
| 695 |
+
script.string = placeholder
|
| 696 |
+
|
| 697 |
+
# Get all text nodes
|
| 698 |
+
for element in soup.find_all(string=True):
|
| 699 |
+
# Skip script, style, and noscript content completely
|
| 700 |
+
if element.parent.name in ['script', 'style', 'noscript']:
|
| 701 |
+
continue
|
| 702 |
+
|
| 703 |
+
# Skip if it's a script placeholder
|
| 704 |
+
text = element.strip()
|
| 705 |
+
if text.startswith("###SCRIPT_CONTENT_") and text.endswith("###"):
|
| 706 |
+
continue
|
| 707 |
+
|
| 708 |
+
if text and not self.should_skip_element(element, text):
|
| 709 |
+
text_elements.append({
|
| 710 |
+
'text': text,
|
| 711 |
+
'element': element
|
| 712 |
+
})
|
| 713 |
+
|
| 714 |
+
return soup, text_elements, script_placeholders
|
| 715 |
+
|
| 716 |
+
def process_html(self, html_content, progress_callback=None):
|
| 717 |
+
"""Main processing function with progress callback"""
|
| 718 |
+
if not html_content.strip():
|
| 719 |
+
return "Please provide HTML content."
|
| 720 |
+
|
| 721 |
+
try:
|
| 722 |
+
# Extract text elements with script preservation
|
| 723 |
+
soup, text_elements, script_placeholders = self.extract_text_from_html(html_content)
|
| 724 |
|
| 725 |
+
total_elements = len(text_elements)
|
| 726 |
+
print(f"Found {total_elements} text elements to process (after filtering)")
|
|
|
|
|
|
|
|
|
|
| 727 |
|
| 728 |
+
# Process each text element
|
| 729 |
+
processed_count = 0
|
| 730 |
+
|
| 731 |
+
for i, element_info in enumerate(text_elements):
|
| 732 |
+
original_text = element_info['text']
|
| 733 |
|
| 734 |
+
# Skip placeholders
|
| 735 |
+
if "###SCRIPT_" in original_text:
|
| 736 |
+
continue
|
| 737 |
|
| 738 |
+
# Skip very short texts
|
| 739 |
+
if len(original_text.split()) < 3:
|
| 740 |
+
continue
|
| 741 |
|
| 742 |
+
# Process the text with your existing logic
|
| 743 |
+
paraphrased_text = self.paraphrase_with_dipper(
|
| 744 |
+
original_text,
|
| 745 |
+
lex_diversity=60,
|
| 746 |
+
order_diversity=20
|
| 747 |
+
)
|
| 748 |
|
| 749 |
+
# Apply other transformations...
|
| 750 |
+
paraphrased_text = self.apply_sentence_variation(paraphrased_text)
|
| 751 |
+
paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
|
| 752 |
+
paraphrased_text = self.fix_punctuation(paraphrased_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 753 |
|
| 754 |
+
# Final quality check
|
| 755 |
+
if paraphrased_text and len(paraphrased_text.split()) >= 3:
|
| 756 |
+
element_info['element'].replace_with(NavigableString(paraphrased_text))
|
| 757 |
+
processed_count += 1
|
| 758 |
+
|
| 759 |
+
# Progress update
|
| 760 |
+
if progress_callback:
|
| 761 |
+
progress_callback(i + 1, total_elements)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 762 |
|
| 763 |
+
# Get the processed HTML
|
| 764 |
+
result_html = str(soup)
|
|
|
|
| 765 |
|
| 766 |
+
# CRITICAL: Restore all script content exactly as it was
|
| 767 |
+
for placeholder, original_script in script_placeholders.items():
|
| 768 |
+
result_html = result_html.replace(f"<script>{placeholder}</script>", original_script)
|
| 769 |
+
|
| 770 |
+
# Post-process the entire HTML
|
| 771 |
+
result_html = self.post_process_html(result_html)
|
| 772 |
+
result_html = self.validate_and_fix_html(result_html)
|
| 773 |
+
|
| 774 |
+
print(f"Successfully processed {processed_count} text elements")
|
| 775 |
+
return result_html
|
| 776 |
+
|
| 777 |
+
except Exception as e:
|
| 778 |
+
import traceback
|
| 779 |
+
error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
|
| 780 |
+
print(error_msg)
|
| 781 |
+
return f"<!-- {error_msg} -->\n{html_content}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 782 |
|
| 783 |
def is_likely_acronym_or_proper_noun(self, word):
|
| 784 |
"""Check if a word is likely an acronym or part of a proper noun"""
|
|
|
|
| 1322 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 1323 |
text_elements = []
|
| 1324 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1325 |
# Get all text nodes using string instead of text (fixing deprecation)
|
| 1326 |
for element in soup.find_all(string=True):
|
| 1327 |
# Skip script, style, and noscript content completely
|
| 1328 |
if element.parent.name in ['script', 'style', 'noscript']:
|
| 1329 |
continue
|
| 1330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1331 |
text = element.strip()
|
| 1332 |
if text and not self.should_skip_element(element, text):
|
| 1333 |
text_elements.append({
|
|
|
|
| 1338 |
return soup, text_elements
|
| 1339 |
|
| 1340 |
def validate_and_fix_html(self, html_text):
|
| 1341 |
+
"""Fix common HTML syntax errors after processing"""
|
| 1342 |
+
|
| 1343 |
+
# First, protect script content
|
| 1344 |
+
script_pattern = r'<script[^>]*>(.*?)</script>'
|
| 1345 |
+
scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
|
| 1346 |
+
script_placeholders = {}
|
| 1347 |
+
|
| 1348 |
+
for i, script_content in enumerate(scripts):
|
| 1349 |
+
placeholder = f"<!--SCRIPT_PLACEHOLDER_{i}-->"
|
| 1350 |
+
script_placeholders[placeholder] = script_content
|
| 1351 |
+
html_text = html_text.replace(
|
| 1352 |
+
f'<script>{script_content}</script>',
|
| 1353 |
+
f'<script>{placeholder}</script>',
|
| 1354 |
+
1
|
| 1355 |
+
)
|
| 1356 |
+
|
| 1357 |
+
# Fix DOCTYPE
|
| 1358 |
+
html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
|
| 1359 |
+
|
| 1360 |
+
# Fix spacing issues (but not inside scripts)
|
| 1361 |
+
html_text = re.sub(r'>\s+<', '><', html_text)
|
| 1362 |
+
html_text = re.sub(r'\s+>', '>', html_text)
|
| 1363 |
+
html_text = re.sub(r'<\s+', '<', html_text)
|
| 1364 |
+
|
| 1365 |
+
# Fix common word errors that might occur during processing
|
| 1366 |
+
html_text = html_text.replace('down loaded', 'downloaded')
|
| 1367 |
+
html_text = html_text.replace('But your document', 'Your document')
|
| 1368 |
+
|
| 1369 |
+
# Restore script content
|
| 1370 |
+
for placeholder, script_content in script_placeholders.items():
|
| 1371 |
+
html_text = html_text.replace(
|
| 1372 |
+
f'<script>{placeholder}</script>',
|
| 1373 |
+
f'<script>{script_content}</script>'
|
| 1374 |
+
)
|
| 1375 |
+
|
| 1376 |
+
return html_text
|
| 1377 |
|
| 1378 |
def add_natural_flow_variations(self, text):
|
| 1379 |
"""Add more natural flow and rhythm variations for Originality AI"""
|
|
|
|
| 1448 |
preserved_scripts = []
|
| 1449 |
preserved_styles = []
|
| 1450 |
|
| 1451 |
+
# Temporarily replace script and style tags with placeholders
|
| 1452 |
+
soup_temp = BeautifulSoup(html_content, 'html.parser')
|
| 1453 |
|
| 1454 |
+
# Preserve all script tags
|
| 1455 |
+
for idx, script in enumerate(soup_temp.find_all('script')):
|
| 1456 |
+
placeholder = script_placeholder.format(idx)
|
| 1457 |
preserved_scripts.append(str(script))
|
| 1458 |
script.replace_with(placeholder)
|
| 1459 |
|
| 1460 |
# Preserve all style tags
|
| 1461 |
+
for idx, style in enumerate(soup_temp.find_all('style')):
|
| 1462 |
+
placeholder = style_placeholder.format(idx)
|
| 1463 |
preserved_styles.append(str(style))
|
| 1464 |
style.replace_with(placeholder)
|
| 1465 |
|
| 1466 |
# Get the modified HTML
|
| 1467 |
+
html_content = str(soup_temp)
|
| 1468 |
|
| 1469 |
try:
|
| 1470 |
# Extract text elements
|
|
|
|
| 1528 |
# Restore all script tags
|
| 1529 |
for idx, script_content in enumerate(preserved_scripts):
|
| 1530 |
placeholder = script_placeholder.format(idx)
|
| 1531 |
+
result = result.replace(placeholder, script_content)
|
| 1532 |
|
| 1533 |
# Restore all style tags
|
| 1534 |
for idx, style_content in enumerate(preserved_styles):
|
| 1535 |
placeholder = style_placeholder.format(idx)
|
| 1536 |
+
result = result.replace(placeholder, style_content)
|
| 1537 |
|
| 1538 |
# Post-process the entire HTML to fix bold/strong formatting
|
| 1539 |
result = self.post_process_html(result)
|