EdysorEdutech committed on
Commit
942fc21
·
verified ·
1 Parent(s): d97439e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -171
app.py CHANGED
@@ -663,150 +663,122 @@ class EnhancedDipperHumanizer:
663
  return text
664
 
665
  def should_skip_element(self, element, text):
666
- """Determine if an element should be skipped from paraphrasing"""
667
- if not text or len(text.strip()) < 3:
 
 
 
 
 
 
 
 
 
 
668
  return True
669
 
670
- # Skip JavaScript code inside script tags
671
- parent = element.parent
672
- if parent and parent.name in ['script', 'style', 'noscript']:
673
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
 
675
- # Skip inline JavaScript (onclick, onchange, etc.)
676
- if parent and parent.attrs:
677
- for attr_name, attr_value in parent.attrs.items():
678
- if attr_name.startswith('on') and 'selectOption' in str(attr_value):
679
- return True
680
 
681
- # Skip if text contains JavaScript function calls
682
- if 'function' in text or 'selectOption' in text or '=>' in text:
683
- return True
 
 
684
 
685
- # Skip headings (h1-h6)
686
- if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
687
- return True
688
 
689
- # Skip content inside <strong> and <b> tags
690
- if parent and parent.name in ['strong', 'b']:
691
- return True
692
 
693
- # Skip table content
694
- if parent and (parent.name in ['td', 'th'] or any(p.name == 'table' for p in parent.parents)):
695
- return True
 
 
 
696
 
697
- # Special handling for content inside tables
698
- # Skip if it's inside strong/b/h1-h6 tags AND also inside a table
699
- if parent:
700
- # Check if we're inside a table
701
- is_in_table = any(p.name == 'table' for p in parent.parents)
702
- if is_in_table:
703
- # If we're in a table, skip any text that's inside formatting tags
704
- if parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'i']:
705
- return True
706
- # Also check if parent's parent is a formatting tag
707
- if parent.parent and parent.parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
708
- return True
709
-
710
- # Skip table of contents
711
- if parent:
712
- parent_text = str(parent).lower()
713
- if any(toc in parent_text for toc in ['table of contents', 'toc-', 'contents']):
714
- return True
715
-
716
- # Skip CTAs and buttons
717
- if parent and parent.name in ['button', 'a']:
718
- return True
719
 
720
- # Skip if parent has onclick or other event handlers
721
- if parent and parent.attrs:
722
- event_handlers = ['onclick', 'onchange', 'onsubmit', 'onload', 'onmouseover', 'onmouseout']
723
- if any(handler in parent.attrs for handler in event_handlers):
724
- return True
725
-
726
- # Special check for testimonial cards - check up to 3 levels of ancestors
727
- if parent:
728
- ancestors_to_check = []
729
- current = parent
730
- for _ in range(3): # Check up to 3 levels up
731
- if current:
732
- ancestors_to_check.append(current)
733
- current = current.parent
734
-
735
- # Check if any ancestor has testimonial-card class
736
- for ancestor in ancestors_to_check:
737
- if ancestor and ancestor.get('class'):
738
- classes = ancestor.get('class', [])
739
- if isinstance(classes, list):
740
- if any('testimonial-card' in str(cls) for cls in classes):
741
- return True
742
- elif isinstance(classes, str) and 'testimonial-card' in classes:
743
- return True
744
-
745
- # Skip if IMMEDIATE parent or element itself has skip-worthy classes/IDs
746
- skip_indicators = [
747
- 'button', 'btn', 'heading', 'title', 'caption',
748
- 'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
749
- 'warning', 'info', 'success', 'error', 'code', 'pre',
750
- 'stats-grid', 'testimonial-card',
751
- 'cta-box', 'quiz-container', 'contact-form',
752
- 'faq-question', 'sidebar', 'widget', 'banner',
753
- 'author-intro', 'testimonial', 'review', 'feedback',
754
- 'floating-', 'stat-', 'progress-', 'option', 'results',
755
- 'question-container', 'quiz-',
756
- 'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
757
- ]
758
 
759
- # Skip quiz-related elements
760
- quiz_indicators = ['quiz-container', 'question-container', 'questionContainer',
761
- 'results', 'progressFill', 'currentQuestion', 'totalQuestions']
762
 
763
- # Check only immediate parent and grandparent (not all ancestors)
764
- elements_to_check = [parent]
765
- if parent and parent.parent:
766
- elements_to_check.append(parent.parent)
767
-
768
- for elem in elements_to_check:
769
- if not elem:
770
- continue
771
-
772
- # Check element's class
773
- elem_class = elem.get('class', [])
774
- if isinstance(elem_class, list):
775
- class_str = ' '.join(str(cls).lower() for cls in elem_class)
776
- if any(indicator in class_str for indicator in skip_indicators):
777
- return True
778
- if any(indicator in class_str for indicator in quiz_indicators):
779
- return True
780
-
781
- # Check element's ID
782
- elem_id = elem.get('id', '')
783
- if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
784
- return True
785
- if elem_id in quiz_indicators:
786
- return True
787
-
788
- # Skip short phrases that might be UI elements
789
- word_count = len(text.split())
790
- if word_count <= 5:
791
- ui_patterns = [
792
- 'click', 'download', 'learn more', 'read more', 'sign up',
793
- 'get started', 'try now', 'buy now', 'next', 'previous',
794
- 'back', 'continue', 'submit', 'cancel', 'get now', 'book your',
795
- 'check out:', 'see also:', 'related:', 'question', 'of'
796
- ]
797
- if any(pattern in text.lower() for pattern in ui_patterns):
798
- return True
799
-
800
- # Skip very short content in styled containers
801
- if parent and parent.name in ['div', 'section', 'aside', 'blockquote']:
802
- style = parent.get('style', '')
803
- if 'border' in style or 'background' in style:
804
- if word_count <= 20:
805
- # But don't skip if it's inside a paragraph
806
- if not any(p.name == 'p' for p in parent.parents):
807
- return True
808
-
809
- return False
810
 
811
  def is_likely_acronym_or_proper_noun(self, word):
812
  """Check if a word is likely an acronym or part of a proper noun"""
@@ -1350,25 +1322,12 @@ class EnhancedDipperHumanizer:
1350
  soup = BeautifulSoup(html_content, 'html.parser')
1351
  text_elements = []
1352
 
1353
- # Get all elements with onclick, onchange, etc.
1354
- elements_with_handlers = soup.find_all(attrs=lambda x: x and any(k.startswith('on') for k in x.keys()))
1355
-
1356
- # Mark these elements to skip their text content
1357
- skip_elements = set()
1358
- for elem in elements_with_handlers:
1359
- skip_elements.add(elem)
1360
- skip_elements.update(elem.descendants)
1361
-
1362
  # Get all text nodes using string instead of text (fixing deprecation)
1363
  for element in soup.find_all(string=True):
1364
  # Skip script, style, and noscript content completely
1365
  if element.parent.name in ['script', 'style', 'noscript']:
1366
  continue
1367
 
1368
- # Skip if element or any parent has event handlers
1369
- if any(parent in skip_elements for parent in element.parents):
1370
- continue
1371
-
1372
  text = element.strip()
1373
  if text and not self.should_skip_element(element, text):
1374
  text_elements.append({
@@ -1379,21 +1338,42 @@ class EnhancedDipperHumanizer:
1379
  return soup, text_elements
1380
 
1381
  def validate_and_fix_html(self, html_text):
1382
- """Fix common HTML syntax errors after processing"""
1383
-
1384
- # Fix DOCTYPE
1385
- html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
1386
-
1387
- # Fix spacing issues
1388
- html_text = re.sub(r'>\s+<', '><', html_text) # Remove extra spaces between tags
1389
- html_text = re.sub(r'\s+>', '>', html_text) # Remove spaces before closing >
1390
- html_text = re.sub(r'<\s+', '<', html_text) # Remove spaces after opening
1391
-
1392
- # Fix common word errors that might occur during processing
1393
- html_text = html_text.replace('down loaded', 'downloaded')
1394
- html_text = html_text.replace('But your document', 'Your document')
1395
-
1396
- return html_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1397
 
1398
  def add_natural_flow_variations(self, text):
1399
  """Add more natural flow and rhythm variations for Originality AI"""
@@ -1468,23 +1448,23 @@ class EnhancedDipperHumanizer:
1468
  preserved_scripts = []
1469
  preserved_styles = []
1470
 
1471
- # Parse HTML
1472
- soup = BeautifulSoup(html_content, 'html.parser')
1473
 
1474
- # Find and preserve all script tags WITH their content
1475
- for idx, script in enumerate(soup.find_all('script')):
1476
- placeholder = BeautifulSoup(f'<div>{script_placeholder.format(idx)}</div>', 'html.parser').div
1477
  preserved_scripts.append(str(script))
1478
  script.replace_with(placeholder)
1479
 
1480
  # Preserve all style tags
1481
- for idx, style in enumerate(soup.find_all('style')):
1482
- placeholder = BeautifulSoup(f'<div>{style_placeholder.format(idx)}</div>', 'html.parser').div
1483
  preserved_styles.append(str(style))
1484
  style.replace_with(placeholder)
1485
 
1486
  # Get the modified HTML
1487
- html_content = str(soup)
1488
 
1489
  try:
1490
  # Extract text elements
@@ -1548,12 +1528,12 @@ class EnhancedDipperHumanizer:
1548
  # Restore all script tags
1549
  for idx, script_content in enumerate(preserved_scripts):
1550
  placeholder = script_placeholder.format(idx)
1551
- result = result.replace(f'<div>{placeholder}</div>', script_content)
1552
 
1553
  # Restore all style tags
1554
  for idx, style_content in enumerate(preserved_styles):
1555
  placeholder = style_placeholder.format(idx)
1556
- result = result.replace(f'<div>{placeholder}</div>', style_content)
1557
 
1558
  # Post-process the entire HTML to fix bold/strong formatting
1559
  result = self.post_process_html(result)
 
663
  return text
664
 
665
  def should_skip_element(self, element, text):
666
+ """Determine if an element should be skipped from paraphrasing"""
667
+ if not text or len(text.strip()) < 3:
668
+ return True
669
+
670
+ # Skip JavaScript code inside script tags - CRITICAL FIX
671
+ parent = element.parent
672
+ if parent and parent.name in ['script', 'style', 'noscript']:
673
+ return True
674
+
675
+ # Also check if we're inside a script tag at any level
676
+ for ancestor in element.parents:
677
+ if ancestor.name in ['script', 'style', 'noscript']:
678
  return True
679
 
680
+ # Rest of your existing skip logic...
681
+ return False
682
+
683
def extract_text_from_html(self, html_content):
    """Parse HTML and collect the text nodes eligible for paraphrasing.

    Every ``<script>`` element's content is replaced in the parsed tree by a
    ``###SCRIPT_CONTENT_<n>###`` placeholder, and the original serialized tag
    is remembered so the caller can put it back after processing.

    Args:
        html_content: The HTML document as a string.

    Returns:
        A ``(soup, text_elements, script_placeholders)`` tuple:
        ``soup`` is the parsed tree (with script bodies replaced),
        ``text_elements`` is a list of ``{'text': str, 'element': node}``
        dicts, and ``script_placeholders`` maps each placeholder to the
        original ``<script>...</script>`` markup.

    NOTE(review): a later hunk of this file shows a body that returns only
    ``(soup, text_elements)``; if that is a second definition of this method
    it will shadow this one -- confirm which definition should remain.
    """
    # Local import: comments, doctypes, CDATA etc. all derive from this class.
    from bs4.element import PreformattedString

    soup = BeautifulSoup(html_content, 'html.parser')

    # Preserve all script tags completely.
    script_placeholders = {}
    for index, script in enumerate(soup.find_all('script')):
        placeholder = f"###SCRIPT_CONTENT_{index}###"
        script_placeholders[placeholder] = str(script)
        script.string = placeholder

    text_elements = []
    for element in soup.find_all(string=True):
        # find_all(string=True) also yields comments, doctypes and CDATA;
        # those are markup, not prose, and must never be rewritten into
        # visible text.
        if isinstance(element, PreformattedString):
            continue

        # Skip script, style, and noscript content completely.
        if element.parent.name in ('script', 'style', 'noscript'):
            continue

        text = element.strip()
        if not text:
            continue

        # Skip a script placeholder that surfaced as ordinary text.
        if text.startswith("###SCRIPT_CONTENT_") and text.endswith("###"):
            continue

        if not self.should_skip_element(element, text):
            text_elements.append({
                'text': text,
                'element': element
            })

    return soup, text_elements, script_placeholders
715
+
716
def process_html(self, html_content, progress_callback=None):
    """Paraphrase the prose of an HTML document while preserving scripts.

    Args:
        html_content: The HTML document as a string.
        progress_callback: Optional callable invoked as
            ``progress_callback(done, total)`` once per extracted text
            element, including elements that are skipped.

    Returns:
        The processed HTML string. For empty input the message
        ``"Please provide HTML content."`` is returned. If processing
        raises, the original HTML is returned prefixed by an HTML comment
        holding the error message and traceback.
    """
    if not html_content or not html_content.strip():
        return "Please provide HTML content."

    try:
        # Extract text elements with script preservation
        soup, text_elements, script_placeholders = self.extract_text_from_html(html_content)

        total_elements = len(text_elements)
        print(f"Found {total_elements} text elements to process (after filtering)")

        processed_count = 0
        for position, element_info in enumerate(text_elements, start=1):
            original_text = element_info['text']

            # Placeholders and very short texts are left untouched.
            eligible = (
                "###SCRIPT_" not in original_text
                and len(original_text.split()) >= 3
            )
            if eligible:
                paraphrased_text = self.paraphrase_with_dipper(
                    original_text,
                    lex_diversity=60,
                    order_diversity=20
                )
                paraphrased_text = self.apply_sentence_variation(paraphrased_text)
                paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
                paraphrased_text = self.fix_punctuation(paraphrased_text)

                # Final quality check: keep the original when the rewrite
                # collapsed to fewer than three words.
                if paraphrased_text and len(paraphrased_text.split()) >= 3:
                    element_info['element'].replace_with(NavigableString(paraphrased_text))
                    processed_count += 1

            # Report progress for every element so the caller reaches 100%
            # even when trailing elements are skipped.
            if progress_callback:
                progress_callback(position, total_elements)

        # Get the processed HTML
        result_html = str(soup)

        # Restore all script content exactly as it was. The opening tag may
        # carry attributes (type, src, ...), so match any <script ...> wrapper
        # around the placeholder; a callable replacement keeps backslashes in
        # the script text literal.
        for placeholder, original_script in script_placeholders.items():
            wrapper = re.compile(
                r'<script\b[^>]*>' + re.escape(placeholder) + r'</script>',
                re.IGNORECASE,
            )
            result_html = wrapper.sub(lambda _m, s=original_script: s, result_html)

        # Post-process the entire HTML
        result_html = self.post_process_html(result_html)
        result_html = self.validate_and_fix_html(result_html)

        print(f"Successfully processed {processed_count} text elements")
        return result_html

    except Exception as e:
        import traceback
        error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        # Keep the message from terminating the HTML comment early.
        safe_msg = error_msg.replace("-->", "--&gt;")
        return f"<!-- {safe_msg} -->\n{html_content}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
 
783
  def is_likely_acronym_or_proper_noun(self, word):
784
  """Check if a word is likely an acronym or part of a proper noun"""
 
1322
  soup = BeautifulSoup(html_content, 'html.parser')
1323
  text_elements = []
1324
 
 
 
 
 
 
 
 
 
 
1325
  # Get all text nodes using string instead of text (fixing deprecation)
1326
  for element in soup.find_all(string=True):
1327
  # Skip script, style, and noscript content completely
1328
  if element.parent.name in ['script', 'style', 'noscript']:
1329
  continue
1330
 
 
 
 
 
1331
  text = element.strip()
1332
  if text and not self.should_skip_element(element, text):
1333
  text_elements.append({
 
1338
  return soup, text_elements
1339
 
1340
  def validate_and_fix_html(self, html_text):
1341
+ """Fix common HTML syntax errors after processing"""
1342
+
1343
+ # First, protect script content
1344
+ script_pattern = r'<script[^>]*>(.*?)</script>'
1345
+ scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
1346
+ script_placeholders = {}
1347
+
1348
+ for i, script_content in enumerate(scripts):
1349
+ placeholder = f"<!--SCRIPT_PLACEHOLDER_{i}-->"
1350
+ script_placeholders[placeholder] = script_content
1351
+ html_text = html_text.replace(
1352
+ f'<script>{script_content}</script>',
1353
+ f'<script>{placeholder}</script>',
1354
+ 1
1355
+ )
1356
+
1357
+ # Fix DOCTYPE
1358
+ html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
1359
+
1360
+ # Fix spacing issues (but not inside scripts)
1361
+ html_text = re.sub(r'>\s+<', '><', html_text)
1362
+ html_text = re.sub(r'\s+>', '>', html_text)
1363
+ html_text = re.sub(r'<\s+', '<', html_text)
1364
+
1365
+ # Fix common word errors that might occur during processing
1366
+ html_text = html_text.replace('down loaded', 'downloaded')
1367
+ html_text = html_text.replace('But your document', 'Your document')
1368
+
1369
+ # Restore script content
1370
+ for placeholder, script_content in script_placeholders.items():
1371
+ html_text = html_text.replace(
1372
+ f'<script>{placeholder}</script>',
1373
+ f'<script>{script_content}</script>'
1374
+ )
1375
+
1376
+ return html_text
1377
 
1378
  def add_natural_flow_variations(self, text):
1379
  """Add more natural flow and rhythm variations for Originality AI"""
 
1448
  preserved_scripts = []
1449
  preserved_styles = []
1450
 
1451
+ # Temporarily replace script and style tags with placeholders
1452
+ soup_temp = BeautifulSoup(html_content, 'html.parser')
1453
 
1454
+ # Preserve all script tags
1455
+ for idx, script in enumerate(soup_temp.find_all('script')):
1456
+ placeholder = script_placeholder.format(idx)
1457
  preserved_scripts.append(str(script))
1458
  script.replace_with(placeholder)
1459
 
1460
  # Preserve all style tags
1461
+ for idx, style in enumerate(soup_temp.find_all('style')):
1462
+ placeholder = style_placeholder.format(idx)
1463
  preserved_styles.append(str(style))
1464
  style.replace_with(placeholder)
1465
 
1466
  # Get the modified HTML
1467
+ html_content = str(soup_temp)
1468
 
1469
  try:
1470
  # Extract text elements
 
1528
  # Restore all script tags
1529
  for idx, script_content in enumerate(preserved_scripts):
1530
  placeholder = script_placeholder.format(idx)
1531
+ result = result.replace(placeholder, script_content)
1532
 
1533
  # Restore all style tags
1534
  for idx, style_content in enumerate(preserved_styles):
1535
  placeholder = style_placeholder.format(idx)
1536
+ result = result.replace(placeholder, style_content)
1537
 
1538
  # Post-process the entire HTML to fix bold/strong formatting
1539
  result = self.post_process_html(result)