EdysorEdutech committed on
Commit
d97439e
·
verified ·
1 Parent(s): b39d59a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -11
app.py CHANGED
@@ -671,6 +671,16 @@ class EnhancedDipperHumanizer:
671
  parent = element.parent
672
  if parent and parent.name in ['script', 'style', 'noscript']:
673
  return True
 
 
 
 
 
 
 
 
 
 
674
 
675
  # Skip headings (h1-h6)
676
  if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
@@ -746,6 +756,10 @@ class EnhancedDipperHumanizer:
746
  'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
747
  ]
748
 
 
 
 
 
749
  # Check only immediate parent and grandparent (not all ancestors)
750
  elements_to_check = [parent]
751
  if parent and parent.parent:
@@ -761,11 +775,15 @@ class EnhancedDipperHumanizer:
761
  class_str = ' '.join(str(cls).lower() for cls in elem_class)
762
  if any(indicator in class_str for indicator in skip_indicators):
763
  return True
 
 
764
 
765
  # Check element's ID
766
  elem_id = elem.get('id', '')
767
  if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
768
  return True
 
 
769
 
770
  # Skip short phrases that might be UI elements
771
  word_count = len(text.split())
@@ -1332,12 +1350,25 @@ class EnhancedDipperHumanizer:
1332
  soup = BeautifulSoup(html_content, 'html.parser')
1333
  text_elements = []
1334
 
 
 
 
 
 
 
 
 
 
1335
  # Get all text nodes using string instead of text (fixing deprecation)
1336
  for element in soup.find_all(string=True):
1337
  # Skip script, style, and noscript content completely
1338
  if element.parent.name in ['script', 'style', 'noscript']:
1339
  continue
1340
 
 
 
 
 
1341
  text = element.strip()
1342
  if text and not self.should_skip_element(element, text):
1343
  text_elements.append({
@@ -1356,7 +1387,7 @@ class EnhancedDipperHumanizer:
1356
  # Fix spacing issues
1357
  html_text = re.sub(r'>\s+<', '><', html_text) # Remove extra spaces between tags
1358
  html_text = re.sub(r'\s+>', '>', html_text) # Remove spaces before closing >
1359
- html_text = re.sub(r'<\s+', '<', html_text) # Remove spaces after opening <
1360
 
1361
  # Fix common word errors that might occur during processing
1362
  html_text = html_text.replace('down loaded', 'downloaded')
@@ -1437,23 +1468,23 @@ class EnhancedDipperHumanizer:
1437
  preserved_scripts = []
1438
  preserved_styles = []
1439
 
1440
- # Temporarily replace script and style tags with placeholders
1441
- soup_temp = BeautifulSoup(html_content, 'html.parser')
1442
 
1443
- # Preserve all script tags
1444
- for idx, script in enumerate(soup_temp.find_all('script')):
1445
- placeholder = script_placeholder.format(idx)
1446
  preserved_scripts.append(str(script))
1447
  script.replace_with(placeholder)
1448
 
1449
  # Preserve all style tags
1450
- for idx, style in enumerate(soup_temp.find_all('style')):
1451
- placeholder = style_placeholder.format(idx)
1452
  preserved_styles.append(str(style))
1453
  style.replace_with(placeholder)
1454
 
1455
  # Get the modified HTML
1456
- html_content = str(soup_temp)
1457
 
1458
  try:
1459
  # Extract text elements
@@ -1517,12 +1548,12 @@ class EnhancedDipperHumanizer:
1517
  # Restore all script tags
1518
  for idx, script_content in enumerate(preserved_scripts):
1519
  placeholder = script_placeholder.format(idx)
1520
- result = result.replace(placeholder, script_content)
1521
 
1522
  # Restore all style tags
1523
  for idx, style_content in enumerate(preserved_styles):
1524
  placeholder = style_placeholder.format(idx)
1525
- result = result.replace(placeholder, style_content)
1526
 
1527
  # Post-process the entire HTML to fix bold/strong formatting
1528
  result = self.post_process_html(result)
 
671
  parent = element.parent
672
  if parent and parent.name in ['script', 'style', 'noscript']:
673
  return True
674
+
675
+ # Skip inline JavaScript (onclick, onchange, etc.)
676
+ if parent and parent.attrs:
677
+ for attr_name, attr_value in parent.attrs.items():
678
+ if attr_name.startswith('on') and 'selectOption' in str(attr_value):
679
+ return True
680
+
681
+ # Skip if text contains JavaScript function calls
682
+ if 'function' in text or 'selectOption' in text or '=>' in text:
683
+ return True
684
 
685
  # Skip headings (h1-h6)
686
  if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
 
756
  'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
757
  ]
758
 
759
+ # Skip quiz-related elements
760
+ quiz_indicators = ['quiz-container', 'question-container', 'questionContainer',
761
+ 'results', 'progressFill', 'currentQuestion', 'totalQuestions']
762
+
763
  # Check only immediate parent and grandparent (not all ancestors)
764
  elements_to_check = [parent]
765
  if parent and parent.parent:
 
775
  class_str = ' '.join(str(cls).lower() for cls in elem_class)
776
  if any(indicator in class_str for indicator in skip_indicators):
777
  return True
778
+ if any(indicator in class_str for indicator in quiz_indicators):
779
+ return True
780
 
781
  # Check element's ID
782
  elem_id = elem.get('id', '')
783
  if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
784
  return True
785
+ if elem_id in quiz_indicators:
786
+ return True
787
 
788
  # Skip short phrases that might be UI elements
789
  word_count = len(text.split())
 
1350
  soup = BeautifulSoup(html_content, 'html.parser')
1351
  text_elements = []
1352
 
1353
+ # Get all elements with onclick, onchange, etc.
1354
+ elements_with_handlers = soup.find_all(attrs=lambda x: x and any(k.startswith('on') for k in x.keys()))
1355
+
1356
+ # Mark these elements to skip their text content
1357
+ skip_elements = set()
1358
+ for elem in elements_with_handlers:
1359
+ skip_elements.add(elem)
1360
+ skip_elements.update(elem.descendants)
1361
+
1362
  # Get all text nodes using string instead of text (fixing deprecation)
1363
  for element in soup.find_all(string=True):
1364
  # Skip script, style, and noscript content completely
1365
  if element.parent.name in ['script', 'style', 'noscript']:
1366
  continue
1367
 
1368
+ # Skip if element or any parent has event handlers
1369
+ if any(parent in skip_elements for parent in element.parents):
1370
+ continue
1371
+
1372
  text = element.strip()
1373
  if text and not self.should_skip_element(element, text):
1374
  text_elements.append({
 
1387
  # Fix spacing issues
1388
  html_text = re.sub(r'>\s+<', '><', html_text) # Remove extra spaces between tags
1389
  html_text = re.sub(r'\s+>', '>', html_text) # Remove spaces before closing >
1390
+ html_text = re.sub(r'<\s+', '<', html_text) # Remove spaces after opening
1391
 
1392
  # Fix common word errors that might occur during processing
1393
  html_text = html_text.replace('down loaded', 'downloaded')
 
1468
  preserved_scripts = []
1469
  preserved_styles = []
1470
 
1471
+ # Parse HTML
1472
+ soup = BeautifulSoup(html_content, 'html.parser')
1473
 
1474
+ # Find and preserve all script tags WITH their content
1475
+ for idx, script in enumerate(soup.find_all('script')):
1476
+ placeholder = BeautifulSoup(f'<div>{script_placeholder.format(idx)}</div>', 'html.parser').div
1477
  preserved_scripts.append(str(script))
1478
  script.replace_with(placeholder)
1479
 
1480
  # Preserve all style tags
1481
+ for idx, style in enumerate(soup.find_all('style')):
1482
+ placeholder = BeautifulSoup(f'<div>{style_placeholder.format(idx)}</div>', 'html.parser').div
1483
  preserved_styles.append(str(style))
1484
  style.replace_with(placeholder)
1485
 
1486
  # Get the modified HTML
1487
+ html_content = str(soup)
1488
 
1489
  try:
1490
  # Extract text elements
 
1548
  # Restore all script tags
1549
  for idx, script_content in enumerate(preserved_scripts):
1550
  placeholder = script_placeholder.format(idx)
1551
+ result = result.replace(f'<div>{placeholder}</div>', script_content)
1552
 
1553
  # Restore all style tags
1554
  for idx, style_content in enumerate(preserved_styles):
1555
  placeholder = style_placeholder.format(idx)
1556
+ result = result.replace(f'<div>{placeholder}</div>', style_content)
1557
 
1558
  # Post-process the entire HTML to fix bold/strong formatting
1559
  result = self.post_process_html(result)