Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -671,6 +671,16 @@ class EnhancedDipperHumanizer:
|
|
| 671 |
parent = element.parent
|
| 672 |
if parent and parent.name in ['script', 'style', 'noscript']:
|
| 673 |
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 674 |
|
| 675 |
# Skip headings (h1-h6)
|
| 676 |
if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
|
|
@@ -746,6 +756,10 @@ class EnhancedDipperHumanizer:
|
|
| 746 |
'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
|
| 747 |
]
|
| 748 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
# Check only immediate parent and grandparent (not all ancestors)
|
| 750 |
elements_to_check = [parent]
|
| 751 |
if parent and parent.parent:
|
|
@@ -761,11 +775,15 @@ class EnhancedDipperHumanizer:
|
|
| 761 |
class_str = ' '.join(str(cls).lower() for cls in elem_class)
|
| 762 |
if any(indicator in class_str for indicator in skip_indicators):
|
| 763 |
return True
|
|
|
|
|
|
|
| 764 |
|
| 765 |
# Check element's ID
|
| 766 |
elem_id = elem.get('id', '')
|
| 767 |
if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
|
| 768 |
return True
|
|
|
|
|
|
|
| 769 |
|
| 770 |
# Skip short phrases that might be UI elements
|
| 771 |
word_count = len(text.split())
|
|
@@ -1332,12 +1350,25 @@ class EnhancedDipperHumanizer:
|
|
| 1332 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 1333 |
text_elements = []
|
| 1334 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1335 |
# Get all text nodes using string instead of text (fixing deprecation)
|
| 1336 |
for element in soup.find_all(string=True):
|
| 1337 |
# Skip script, style, and noscript content completely
|
| 1338 |
if element.parent.name in ['script', 'style', 'noscript']:
|
| 1339 |
continue
|
| 1340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1341 |
text = element.strip()
|
| 1342 |
if text and not self.should_skip_element(element, text):
|
| 1343 |
text_elements.append({
|
|
@@ -1356,7 +1387,7 @@ class EnhancedDipperHumanizer:
|
|
| 1356 |
# Fix spacing issues
|
| 1357 |
html_text = re.sub(r'>\s+<', '><', html_text) # Remove extra spaces between tags
|
| 1358 |
html_text = re.sub(r'\s+>', '>', html_text) # Remove spaces before closing >
|
| 1359 |
-
html_text = re.sub(r'<\s+', '<', html_text) # Remove spaces after opening
|
| 1360 |
|
| 1361 |
# Fix common word errors that might occur during processing
|
| 1362 |
html_text = html_text.replace('down loaded', 'downloaded')
|
|
@@ -1437,23 +1468,23 @@ class EnhancedDipperHumanizer:
|
|
| 1437 |
preserved_scripts = []
|
| 1438 |
preserved_styles = []
|
| 1439 |
|
| 1440 |
-
#
|
| 1441 |
-
|
| 1442 |
|
| 1443 |
-
#
|
| 1444 |
-
for idx, script in enumerate(
|
| 1445 |
-
placeholder = script_placeholder.format(idx)
|
| 1446 |
preserved_scripts.append(str(script))
|
| 1447 |
script.replace_with(placeholder)
|
| 1448 |
|
| 1449 |
# Preserve all style tags
|
| 1450 |
-
for idx, style in enumerate(
|
| 1451 |
-
placeholder = style_placeholder.format(idx)
|
| 1452 |
preserved_styles.append(str(style))
|
| 1453 |
style.replace_with(placeholder)
|
| 1454 |
|
| 1455 |
# Get the modified HTML
|
| 1456 |
-
html_content = str(
|
| 1457 |
|
| 1458 |
try:
|
| 1459 |
# Extract text elements
|
|
@@ -1517,12 +1548,12 @@ class EnhancedDipperHumanizer:
|
|
| 1517 |
# Restore all script tags
|
| 1518 |
for idx, script_content in enumerate(preserved_scripts):
|
| 1519 |
placeholder = script_placeholder.format(idx)
|
| 1520 |
-
result = result.replace(placeholder, script_content)
|
| 1521 |
|
| 1522 |
# Restore all style tags
|
| 1523 |
for idx, style_content in enumerate(preserved_styles):
|
| 1524 |
placeholder = style_placeholder.format(idx)
|
| 1525 |
-
result = result.replace(placeholder, style_content)
|
| 1526 |
|
| 1527 |
# Post-process the entire HTML to fix bold/strong formatting
|
| 1528 |
result = self.post_process_html(result)
|
|
|
|
| 671 |
parent = element.parent
|
| 672 |
if parent and parent.name in ['script', 'style', 'noscript']:
|
| 673 |
return True
|
| 674 |
+
|
| 675 |
+
# Skip text whose parent has an inline event handler (onclick, onchange, etc.) that references selectOption
|
| 676 |
+
if parent and parent.attrs:
|
| 677 |
+
for attr_name, attr_value in parent.attrs.items():
|
| 678 |
+
if attr_name.startswith('on') and 'selectOption' in str(attr_value):
|
| 679 |
+
return True
|
| 680 |
+
|
| 681 |
+
# Skip text that looks like JavaScript source (contains 'function', 'selectOption' or '=>')
|
| 682 |
+
if 'function' in text or 'selectOption' in text or '=>' in text:
|
| 683 |
+
return True
|
| 684 |
|
| 685 |
# Skip headings (h1-h6)
|
| 686 |
if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
|
|
|
|
| 756 |
'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
|
| 757 |
]
|
| 758 |
|
| 759 |
+
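# Skip quiz-related elements
# NOTE(review): class names are lowercased before matching, so the camelCase entries below can only match via the exact ID check - confirm this is intended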
|
| 760 |
+
quiz_indicators = ['quiz-container', 'question-container', 'questionContainer',
|
| 761 |
+
'results', 'progressFill', 'currentQuestion', 'totalQuestions']
|
| 762 |
+
|
| 763 |
# Check only immediate parent and grandparent (not all ancestors)
|
| 764 |
elements_to_check = [parent]
|
| 765 |
if parent and parent.parent:
|
|
|
|
| 775 |
class_str = ' '.join(str(cls).lower() for cls in elem_class)
|
| 776 |
if any(indicator in class_str for indicator in skip_indicators):
|
| 777 |
return True
|
| 778 |
+
if any(indicator in class_str for indicator in quiz_indicators):
|
| 779 |
+
return True
|
| 780 |
|
| 781 |
# Check element's ID
|
| 782 |
elem_id = elem.get('id', '')
|
| 783 |
if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
|
| 784 |
return True
|
| 785 |
+
if elem_id in quiz_indicators:
|
| 786 |
+
return True
|
| 787 |
|
| 788 |
# Skip short phrases that might be UI elements
|
| 789 |
word_count = len(text.split())
|
|
|
|
| 1350 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 1351 |
text_elements = []
|
| 1352 |
|
| 1353 |
+
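# Collect elements that carry inline event-handler attributes (onclick, onchange, etc.)
# NOTE(review): BeautifulSoup appears to treat a non-dict `attrs` value as a filter on the `class` attribute, so this lambda may receive class strings rather than the attribute dict - confirm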
|
| 1354 |
+
elements_with_handlers = soup.find_all(attrs=lambda x: x and any(k.startswith('on') for k in x.keys()))
|
| 1355 |
+
|
| 1356 |
+
# Mark these elements to skip their text content
|
| 1357 |
+
skip_elements = set()
|
| 1358 |
+
for elem in elements_with_handlers:
|
| 1359 |
+
skip_elements.add(elem)
|
| 1360 |
+
skip_elements.update(elem.descendants)
|
| 1361 |
+
|
| 1362 |
# Get all text nodes using string instead of text (fixing deprecation)
|
| 1363 |
for element in soup.find_all(string=True):
|
| 1364 |
# Skip script, style, and noscript content completely
|
| 1365 |
if element.parent.name in ['script', 'style', 'noscript']:
|
| 1366 |
continue
|
| 1367 |
|
| 1368 |
+
# Skip text nodes that have any ancestor marked above (an element with an event handler, or a descendant of one)
|
| 1369 |
+
if any(parent in skip_elements for parent in element.parents):
|
| 1370 |
+
continue
|
| 1371 |
+
|
| 1372 |
text = element.strip()
|
| 1373 |
if text and not self.should_skip_element(element, text):
|
| 1374 |
text_elements.append({
|
|
|
|
| 1387 |
# Fix spacing issues
|
| 1388 |
html_text = re.sub(r'>\s+<', '><', html_text) # Remove extra spaces between tags
|
| 1389 |
html_text = re.sub(r'\s+>', '>', html_text) # Remove spaces before closing >
|
| 1390 |
+
html_text = re.sub(r'<\s+', '<', html_text) # Remove spaces after opening
|
| 1391 |
|
| 1392 |
# Fix common word errors that might occur during processing
|
| 1393 |
html_text = html_text.replace('down loaded', 'downloaded')
|
|
|
|
| 1468 |
preserved_scripts = []
|
| 1469 |
preserved_styles = []
|
| 1470 |
|
| 1471 |
+
# Parse HTML
|
| 1472 |
+
soup = BeautifulSoup(html_content, 'html.parser')
|
| 1473 |
|
| 1474 |
+
# Find and preserve all script tags WITH their content
|
| 1475 |
+
for idx, script in enumerate(soup.find_all('script')):
|
| 1476 |
+
placeholder = BeautifulSoup(f'<div>{script_placeholder.format(idx)}</div>', 'html.parser').div
|
| 1477 |
preserved_scripts.append(str(script))
|
| 1478 |
script.replace_with(placeholder)
|
| 1479 |
|
| 1480 |
# Preserve all style tags
|
| 1481 |
+
for idx, style in enumerate(soup.find_all('style')):
|
| 1482 |
+
placeholder = BeautifulSoup(f'<div>{style_placeholder.format(idx)}</div>', 'html.parser').div
|
| 1483 |
preserved_styles.append(str(style))
|
| 1484 |
style.replace_with(placeholder)
|
| 1485 |
|
| 1486 |
# Get the modified HTML
|
| 1487 |
+
html_content = str(soup)
|
| 1488 |
|
| 1489 |
try:
|
| 1490 |
# Extract text elements
|
|
|
|
| 1548 |
# Restore all script tags
|
| 1549 |
for idx, script_content in enumerate(preserved_scripts):
|
| 1550 |
placeholder = script_placeholder.format(idx)
|
| 1551 |
+
result = result.replace(f'<div>{placeholder}</div>', script_content)
|
| 1552 |
|
| 1553 |
# Restore all style tags
|
| 1554 |
for idx, style_content in enumerate(preserved_styles):
|
| 1555 |
placeholder = style_placeholder.format(idx)
|
| 1556 |
+
result = result.replace(f'<div>{placeholder}</div>', style_content)
|
| 1557 |
|
| 1558 |
# Post-process the entire HTML to fix bold/strong formatting
|
| 1559 |
result = self.post_process_html(result)
|