EdysorEdutech committed on
Commit
99d994a
·
verified ·
1 Parent(s): 0bc69df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +149 -263
app.py CHANGED
@@ -666,17 +666,12 @@ class EnhancedDipperHumanizer:
666
  """Determine if an element should be skipped from paraphrasing"""
667
  if not text or len(text.strip()) < 3:
668
  return True
669
-
670
- # Skip JavaScript code inside script tags - CRITICAL FIX
671
  parent = element.parent
672
  if parent and parent.name in ['script', 'style', 'noscript']:
673
  return True
674
-
675
- # Also check if we're inside a script tag at any level
676
- for ancestor in element.parents:
677
- if ancestor.name in ['script', 'style', 'noscript']:
678
- return True
679
-
680
  # Skip headings (h1-h6)
681
  if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
682
  return True
@@ -792,237 +787,8 @@ class EnhancedDipperHumanizer:
792
  # But don't skip if it's inside a paragraph
793
  if not any(p.name == 'p' for p in parent.parents):
794
  return True
795
-
796
  return False
797
-
798
- def extract_text_from_html(self, html_content):
799
- """Extract text elements from HTML with skip logic"""
800
- soup = BeautifulSoup(html_content, 'html.parser')
801
- text_elements = []
802
-
803
- # CRITICAL: Preserve all script tags completely
804
- script_tags = soup.find_all('script')
805
- script_placeholders = {}
806
-
807
- for i, script in enumerate(script_tags):
808
- placeholder = f"###SCRIPT_CONTENT_{i}###"
809
- script_placeholders[placeholder] = str(script)
810
- script.string = placeholder
811
-
812
- # Get all text nodes
813
- for element in soup.find_all(string=True):
814
- # Skip script, style, and noscript content completely
815
- if element.parent.name in ['script', 'style', 'noscript']:
816
- continue
817
-
818
- # Skip if it's a script placeholder
819
- text = element.strip()
820
- if text.startswith("###SCRIPT_CONTENT_") and text.endswith("###"):
821
- continue
822
-
823
- if text and not self.should_skip_element(element, text):
824
- text_elements.append({
825
- 'text': text,
826
- 'element': element
827
- })
828
-
829
- return soup, text_elements, script_placeholders
830
-
831
- def process_html(self, html_content, progress_callback=None):
832
- """Main processing function with progress callback"""
833
- if not html_content.strip():
834
- return "Please provide HTML content."
835
-
836
- # Parse the HTML first
837
- soup_initial = BeautifulSoup(html_content, 'html.parser')
838
-
839
- # Store ALL scripts (both external and inline) with their full content
840
- script_storage = []
841
- script_placeholder_template = "<!--SCRIPT_PLACEHOLDER_{}-->"
842
-
843
- # Find and replace all script tags
844
- for idx, script in enumerate(soup_initial.find_all('script')):
845
- placeholder = script_placeholder_template.format(idx)
846
- # Store the entire script tag as a string
847
- script_storage.append(str(script))
848
- # Replace with a comment placeholder
849
- new_tag = soup_initial.new_string(placeholder)
850
- script.replace_with(new_tag)
851
-
852
- # Also store and replace style tags
853
- style_storage = []
854
- style_placeholder_template = "<!--STYLE_PLACEHOLDER_{}-->"
855
-
856
- for idx, style in enumerate(soup_initial.find_all('style')):
857
- placeholder = style_placeholder_template.format(idx)
858
- style_storage.append(str(style))
859
- new_tag = soup_initial.new_string(placeholder)
860
- style.replace_with(new_tag)
861
-
862
- # Get the modified HTML
863
- html_content = str(soup_initial)
864
-
865
- try:
866
- # Extract text elements
867
- soup, text_elements = self.extract_text_from_html(html_content)
868
-
869
- total_elements = len(text_elements)
870
- print(f"Found {total_elements} text elements to process (after filtering)")
871
-
872
- # Process each text element
873
- processed_count = 0
874
-
875
- for i, element_info in enumerate(text_elements):
876
- original_text = element_info['text']
877
-
878
- # Skip placeholders
879
- if "SCRIPT_PLACEHOLDER" in original_text or "STYLE_PLACEHOLDER" in original_text:
880
- continue
881
-
882
- # Skip very short texts
883
- if len(original_text.split()) < 3:
884
- continue
885
-
886
- # First pass with Dipper
887
- paraphrased_text = self.paraphrase_with_dipper(
888
- original_text,
889
- lex_diversity=60,
890
- order_diversity=20
891
- )
892
-
893
- # Second pass with BART for longer texts (balanced probability)
894
- if self.use_bart and len(paraphrased_text.split()) > 8:
895
- # 30% chance to use BART for more variation (balanced)
896
- if random.random() < 0.3:
897
- paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
898
-
899
- # Apply sentence variation
900
- paraphrased_text = self.apply_sentence_variation(paraphrased_text)
901
-
902
- # Add natural flow variations
903
- paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
904
-
905
- # Fix punctuation and formatting
906
- paraphrased_text = self.fix_punctuation(paraphrased_text)
907
-
908
- # Final quality check
909
- if paraphrased_text and len(paraphrased_text.split()) >= 3:
910
- element_info['element'].replace_with(NavigableString(paraphrased_text))
911
- processed_count += 1
912
-
913
- # Progress update
914
- if progress_callback:
915
- progress_callback(i + 1, total_elements)
916
-
917
- if i % 10 == 0 or i == total_elements - 1:
918
- progress = (i + 1) / total_elements * 100
919
- print(f"Progress: {progress:.1f}%")
920
-
921
- # Get the processed HTML
922
- result = str(soup)
923
-
924
- # Restore all script tags exactly as they were
925
- for idx, script_content in enumerate(script_storage):
926
- placeholder = script_placeholder_template.format(idx)
927
- result = result.replace(placeholder, script_content)
928
-
929
- # Restore all style tags exactly as they were
930
- for idx, style_content in enumerate(style_storage):
931
- placeholder = style_placeholder_template.format(idx)
932
- result = result.replace(placeholder, style_content)
933
-
934
- # Post-process the entire HTML to fix bold/strong formatting
935
- result = self.post_process_html(result)
936
-
937
- # Validate and fix HTML syntax (but protect scripts)
938
- result = self.validate_and_fix_html_safe(result)
939
-
940
- print(f"Successfully processed {processed_count} text elements")
941
- print(f"Preserved {len(script_storage)} script tags and {len(style_storage)} style tags")
942
-
943
- return result
944
-
945
- except Exception as e:
946
- import traceback
947
- error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
948
- print(error_msg)
949
- # Return original HTML with error message prepended as HTML comment
950
- return f"<!-- {error_msg} -->\n{html_content}"
951
-
952
- def validate_and_fix_html_safe(self, html_text):
953
- """Fix common HTML syntax errors after processing while protecting scripts"""
954
-
955
- # First, extract and protect script content
956
- script_pattern = r'<script[^>]*>.*?</script>'
957
- scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
958
- script_placeholders = {}
959
-
960
- for i, script_content in enumerate(scripts):
961
- placeholder = f"<!--PROTECTED_SCRIPT_{i}-->"
962
- script_placeholders[placeholder] = script_content
963
- html_text = html_text.replace(script_content, placeholder, 1)
964
-
965
- # Also protect style tags
966
- style_pattern = r'<style[^>]*>.*?</style>'
967
- styles = re.findall(style_pattern, html_text, re.DOTALL | re.IGNORECASE)
968
- style_placeholders = {}
969
-
970
- for i, style_content in enumerate(styles):
971
- placeholder = f"<!--PROTECTED_STYLE_{i}-->"
972
- style_placeholders[placeholder] = style_content
973
- html_text = html_text.replace(style_content, placeholder, 1)
974
-
975
- # Fix DOCTYPE
976
- html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
977
-
978
- # Fix spacing issues (but not inside scripts/styles)
979
- html_text = re.sub(r'>\s+<', '><', html_text)
980
- html_text = re.sub(r'\s+>', '>', html_text)
981
- html_text = re.sub(r'<\s+', '<', html_text)
982
-
983
- # Fix common word errors that might occur during processing
984
- html_text = html_text.replace('down loaded', 'downloaded')
985
- html_text = html_text.replace('But your document', 'Your document')
986
-
987
- # Restore protected scripts
988
- for placeholder, script_content in script_placeholders.items():
989
- html_text = html_text.replace(placeholder, script_content)
990
-
991
- # Restore protected styles
992
- for placeholder, style_content in style_placeholders.items():
993
- html_text = html_text.replace(placeholder, style_content)
994
-
995
- return html_text
996
-
997
- def extract_text_from_html(self, html_content):
998
- """Extract text elements from HTML with skip logic"""
999
- soup = BeautifulSoup(html_content, 'html.parser')
1000
- text_elements = []
1001
-
1002
- # Get all text nodes using strings (the correct method)
1003
- for element in soup.strings:
1004
- # Skip if parent is script, style, or noscript
1005
- if element.parent.name in ['script', 'style', 'noscript']:
1006
- continue
1007
-
1008
- text = element.strip()
1009
-
1010
- # Skip empty strings
1011
- if not text:
1012
- continue
1013
-
1014
- # Skip placeholder texts
1015
- if "SCRIPT_PLACEHOLDER" in text or "STYLE_PLACEHOLDER" in text:
1016
- continue
1017
-
1018
- # Use the existing should_skip_element logic
1019
- if not self.should_skip_element(element, text):
1020
- text_elements.append({
1021
- 'text': text,
1022
- 'element': element
1023
- })
1024
-
1025
- return soup, text_elements
1026
 
1027
  def is_likely_acronym_or_proper_noun(self, word):
1028
  """Check if a word is likely an acronym or part of a proper noun"""
@@ -1561,42 +1327,41 @@ class EnhancedDipperHumanizer:
1561
 
1562
  return text
1563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1564
  def validate_and_fix_html(self, html_text):
1565
  """Fix common HTML syntax errors after processing"""
1566
 
1567
- # First, protect script content
1568
- script_pattern = r'<script[^>]*>(.*?)</script>'
1569
- scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
1570
- script_placeholders = {}
1571
-
1572
- for i, script_content in enumerate(scripts):
1573
- placeholder = f"<!--SCRIPT_PLACEHOLDER_{i}-->"
1574
- script_placeholders[placeholder] = script_content
1575
- html_text = html_text.replace(
1576
- f'<script>{script_content}</script>',
1577
- f'<script>{placeholder}</script>',
1578
- 1
1579
- )
1580
-
1581
  # Fix DOCTYPE
1582
  html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
1583
 
1584
- # Fix spacing issues (but not inside scripts)
1585
- html_text = re.sub(r'>\s+<', '><', html_text)
1586
- html_text = re.sub(r'\s+>', '>', html_text)
1587
- html_text = re.sub(r'<\s+', '<', html_text)
1588
 
1589
  # Fix common word errors that might occur during processing
1590
  html_text = html_text.replace('down loaded', 'downloaded')
1591
  html_text = html_text.replace('But your document', 'Your document')
1592
 
1593
- # Restore script content
1594
- for placeholder, script_content in script_placeholders.items():
1595
- html_text = html_text.replace(
1596
- f'<script>{placeholder}</script>',
1597
- f'<script>{script_content}</script>'
1598
- )
1599
-
1600
  return html_text
1601
 
1602
  def add_natural_flow_variations(self, text):
@@ -1661,6 +1426,127 @@ class EnhancedDipperHumanizer:
1661
 
1662
  return ' '.join(enhanced_sentences)
1663
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1664
  def post_process_html(self, html_text):
1665
  """Post-process the entire HTML to fix formatting issues"""
1666
  # Fix empty angle brackets that might appear
 
666
  """Determine if an element should be skipped from paraphrasing"""
667
  if not text or len(text.strip()) < 3:
668
  return True
669
+
670
+ # Skip JavaScript code inside script tags
671
  parent = element.parent
672
  if parent and parent.name in ['script', 'style', 'noscript']:
673
  return True
674
+
 
 
 
 
 
675
  # Skip headings (h1-h6)
676
  if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
677
  return True
 
787
  # But don't skip if it's inside a paragraph
788
  if not any(p.name == 'p' for p in parent.parents):
789
  return True
790
+
791
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
792
 
793
  def is_likely_acronym_or_proper_noun(self, word):
794
  """Check if a word is likely an acronym or part of a proper noun"""
 
1327
 
1328
  return text
1329
 
1330
def extract_text_from_html(self, html_content):
    """Parse *html_content* and collect the text nodes worth paraphrasing.

    Nodes living inside <script>, <style> or <noscript> tags are ignored,
    as is any node rejected by should_skip_element().

    Args:
        html_content: raw HTML string.

    Returns:
        A ``(soup, text_elements)`` tuple where ``text_elements`` is a list
        of dicts carrying 'text' (the stripped string) and 'element' (the
        live NavigableString node, so callers can replace it in place).
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    non_text_parents = ('script', 'style', 'noscript')

    # Pair each candidate node with its stripped text, filtering out
    # nodes whose immediate parent is a non-text container.
    candidates = (
        (node, node.strip())
        for node in soup.find_all(string=True)
        if node.parent.name not in non_text_parents
    )

    text_elements = [
        {'text': stripped, 'element': node}
        for node, stripped in candidates
        if stripped and not self.should_skip_element(node, stripped)
    ]

    return soup, text_elements
1349
+
1350
def validate_and_fix_html(self, html_text):
    """Fix common HTML syntax errors after processing.

    Script and style blocks are shielded behind comment placeholders
    before the whitespace-normalising regexes run, because those regexes
    would otherwise corrupt JavaScript/CSS — e.g. ``re.sub(r'<\\s+', '<')``
    turns ``a < b`` into ``a <b``.  process_html() calls this AFTER the
    original <script>/<style> tags have been restored, so the protection
    must live here.

    Args:
        html_text: full HTML document as a string.

    Returns:
        The cleaned HTML string; script/style contents are byte-identical
        to the input.
    """
    protected = {}

    def _shield(pattern, tag):
        # Swap each matching block for a unique comment placeholder.
        nonlocal html_text
        for i, content in enumerate(
                re.findall(pattern, html_text, re.DOTALL | re.IGNORECASE)):
            placeholder = f"<!--PROTECTED_{tag}_{i}-->"
            protected[placeholder] = content
            html_text = html_text.replace(content, placeholder, 1)

    _shield(r'<script[^>]*>.*?</script>', 'SCRIPT')
    _shield(r'<style[^>]*>.*?</style>', 'STYLE')

    # Fix DOCTYPE
    html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)

    # Fix spacing issues (scripts/styles are placeholders at this point,
    # so these cannot touch executable code)
    html_text = re.sub(r'>\s+<', '><', html_text)   # spaces between tags
    html_text = re.sub(r'\s+>', '>', html_text)     # spaces before closing >
    html_text = re.sub(r'<\s+', '<', html_text)     # spaces after opening <

    # Fix common word errors that might occur during processing
    html_text = html_text.replace('down loaded', 'downloaded')
    html_text = html_text.replace('But your document', 'Your document')

    # Restore protected scripts and styles verbatim.
    for placeholder, content in protected.items():
        html_text = html_text.replace(placeholder, content)

    return html_text
1366
 
1367
  def add_natural_flow_variations(self, text):
 
1426
 
1427
  return ' '.join(enhanced_sentences)
1428
 
1429
def process_html(self, html_content, progress_callback=None):
    """Paraphrase the text content of an HTML document.

    <script> and <style> tags are swapped for unique text placeholders
    before parsing so their contents are never paraphrased, then restored
    verbatim at the end.

    Args:
        html_content: raw HTML string to process.
        progress_callback: optional callable(done, total) invoked after
            each text element is handled.

    Returns:
        The processed HTML string on success, or — on failure — the
        ORIGINAL input prefixed with an HTML comment describing the error.
    """
    if not html_content.strip():
        return "Please provide HTML content."

    # Keep the untouched input so the error path can return it intact.
    # (Returning the placeholder-substituted copy would silently lose
    # every script and style block.)
    original_html = html_content

    script_placeholder = "###SCRIPT_PLACEHOLDER_{}###"
    style_placeholder = "###STYLE_PLACEHOLDER_{}###"
    preserved_scripts = []
    preserved_styles = []

    # Temporarily replace script and style tags with placeholders.
    soup_temp = BeautifulSoup(html_content, 'html.parser')

    for idx, script in enumerate(soup_temp.find_all('script')):
        preserved_scripts.append(str(script))
        script.replace_with(script_placeholder.format(idx))

    for idx, style in enumerate(soup_temp.find_all('style')):
        preserved_styles.append(str(style))
        style.replace_with(style_placeholder.format(idx))

    html_content = str(soup_temp)

    try:
        # Extract text elements
        soup, text_elements = self.extract_text_from_html(html_content)

        total_elements = len(text_elements)
        print(f"Found {total_elements} text elements to process (after filtering)")

        processed_count = 0

        for i, element_info in enumerate(text_elements):
            original_text = element_info['text']

            # Never paraphrase the placeholder markers themselves.
            if "###SCRIPT_PLACEHOLDER_" in original_text or "###STYLE_PLACEHOLDER_" in original_text:
                continue

            # Skip very short texts
            if len(original_text.split()) < 3:
                continue

            # First pass with Dipper
            paraphrased_text = self.paraphrase_with_dipper(
                original_text,
                lex_diversity=60,
                order_diversity=20
            )

            # Second pass with BART for longer texts (30% of the time,
            # for extra variation without over-rewriting).
            if self.use_bart and len(paraphrased_text.split()) > 8:
                if random.random() < 0.3:
                    paraphrased_text = self.paraphrase_with_bart(paraphrased_text)

            paraphrased_text = self.apply_sentence_variation(paraphrased_text)
            paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
            paraphrased_text = self.fix_punctuation(paraphrased_text)

            # Final quality check before swapping the live node.
            if paraphrased_text and len(paraphrased_text.split()) >= 3:
                element_info['element'].replace_with(NavigableString(paraphrased_text))
                processed_count += 1

            # Progress update
            if progress_callback:
                progress_callback(i + 1, total_elements)

            if i % 10 == 0 or i == total_elements - 1:
                progress = (i + 1) / total_elements * 100
                print(f"Progress: {progress:.1f}%")

        result = str(soup)

        # Restore all script and style tags exactly as they were.
        for idx, script_content in enumerate(preserved_scripts):
            result = result.replace(script_placeholder.format(idx), script_content)
        for idx, style_content in enumerate(preserved_styles):
            result = result.replace(style_placeholder.format(idx), style_content)

        # Post-process the entire HTML to fix bold/strong formatting
        result = self.post_process_html(result)

        # Validate and fix HTML syntax
        result = self.validate_and_fix_html(result)

        # Rough skipped count: visible text nodes minus the ones processed.
        all_text_elements = soup.find_all(string=True)
        skipped = len([e for e in all_text_elements
                       if e.strip() and e.parent.name not in ['script', 'style', 'noscript']]) - total_elements

        print(f"Successfully processed {processed_count} text elements")
        print(f"Skipped {skipped} elements (headings, CTAs, tables, testimonials, strong/bold tags, etc.)")
        print(f"Preserved {len(preserved_scripts)} script tags and {len(preserved_styles)} style tags")

        return result

    except Exception as e:
        import traceback
        error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        # Return the ORIGINAL input (not the placeholder-substituted copy)
        # so no script/style content is lost on failure.
        return f"<!-- {error_msg} -->\n{original_html}"
1549
+
1550
  def post_process_html(self, html_text):
1551
  """Post-process the entire HTML to fix formatting issues"""
1552
  # Fix empty angle brackets that might appear