EdysorEdutech committed on
Commit
0bc69df
·
verified ·
1 Parent(s): afd7422

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -169
app.py CHANGED
@@ -829,200 +829,200 @@ class EnhancedDipperHumanizer:
829
  return soup, text_elements, script_placeholders
830
 
831
  def process_html(self, html_content, progress_callback=None):
832
- """Main processing function with progress callback"""
833
- if not html_content.strip():
834
- return "Please provide HTML content."
835
-
836
- # Parse the HTML first
837
- soup_initial = BeautifulSoup(html_content, 'html.parser')
838
-
839
- # Store ALL scripts (both external and inline) with their full content
840
- script_storage = []
841
- script_placeholder_template = "<!--SCRIPT_PLACEHOLDER_{}-->"
842
-
843
- # Find and replace all script tags
844
- for idx, script in enumerate(soup_initial.find_all('script')):
845
- placeholder = script_placeholder_template.format(idx)
846
- # Store the entire script tag as a string
847
- script_storage.append(str(script))
848
- # Replace with a comment placeholder
849
- new_tag = soup_initial.new_string(placeholder)
850
- script.replace_with(new_tag)
851
-
852
- # Also store and replace style tags
853
- style_storage = []
854
- style_placeholder_template = "<!--STYLE_PLACEHOLDER_{}-->"
855
-
856
- for idx, style in enumerate(soup_initial.find_all('style')):
857
- placeholder = style_placeholder_template.format(idx)
858
- style_storage.append(str(style))
859
- new_tag = soup_initial.new_string(placeholder)
860
- style.replace_with(new_tag)
861
-
862
- # Get the modified HTML
863
- html_content = str(soup_initial)
864
-
865
- try:
866
- # Extract text elements
867
- soup, text_elements = self.extract_text_from_html(html_content)
868
 
869
- total_elements = len(text_elements)
870
- print(f"Found {total_elements} text elements to process (after filtering)")
 
871
 
872
- # Process each text element
873
- processed_count = 0
 
 
 
 
 
 
874
 
875
- for i, element_info in enumerate(text_elements):
876
- original_text = element_info['text']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877
 
878
- # Skip placeholders
879
- if "SCRIPT_PLACEHOLDER" in original_text or "STYLE_PLACEHOLDER" in original_text:
880
- continue
881
 
882
- # Skip very short texts
883
- if len(original_text.split()) < 3:
884
- continue
885
 
886
- # First pass with Dipper
887
- paraphrased_text = self.paraphrase_with_dipper(
888
- original_text,
889
- lex_diversity=60,
890
- order_diversity=20
891
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
892
 
893
- # Second pass with BART for longer texts (balanced probability)
894
- if self.use_bart and len(paraphrased_text.split()) > 8:
895
- # 30% chance to use BART for more variation (balanced)
896
- if random.random() < 0.3:
897
- paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
898
 
899
- # Apply sentence variation
900
- paraphrased_text = self.apply_sentence_variation(paraphrased_text)
 
 
901
 
902
- # Add natural flow variations
903
- paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
 
 
904
 
905
- # Fix punctuation and formatting
906
- paraphrased_text = self.fix_punctuation(paraphrased_text)
907
 
908
- # Final quality check
909
- if paraphrased_text and len(paraphrased_text.split()) >= 3:
910
- element_info['element'].replace_with(NavigableString(paraphrased_text))
911
- processed_count += 1
912
 
913
- # Progress update
914
- if progress_callback:
915
- progress_callback(i + 1, total_elements)
 
916
 
917
- if i % 10 == 0 or i == total_elements - 1:
918
- progress = (i + 1) / total_elements * 100
919
- print(f"Progress: {progress:.1f}%")
 
 
 
 
 
 
920
 
921
- # Get the processed HTML
922
- result = str(soup)
 
 
923
 
924
- # Restore all script tags exactly as they were
925
- for idx, script_content in enumerate(script_storage):
926
- placeholder = script_placeholder_template.format(idx)
927
- result = result.replace(placeholder, script_content)
928
 
929
- # Restore all style tags exactly as they were
930
- for idx, style_content in enumerate(style_storage):
931
- placeholder = style_placeholder_template.format(idx)
932
- result = result.replace(placeholder, style_content)
933
 
934
- # Post-process the entire HTML to fix bold/strong formatting
935
- result = self.post_process_html(result)
 
 
936
 
937
- # Validate and fix HTML syntax (but protect scripts)
938
- result = self.validate_and_fix_html_safe(result)
939
 
940
- print(f"Successfully processed {processed_count} text elements")
941
- print(f"Preserved {len(script_storage)} script tags and {len(style_storage)} style tags")
 
 
942
 
943
- return result
 
 
944
 
945
- except Exception as e:
946
- import traceback
947
- error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
948
- print(error_msg)
949
- # Return original HTML with error message prepended as HTML comment
950
- return f"<!-- {error_msg} -->\n{html_content}"
951
-
952
- def validate_and_fix_html_safe(self, html_text):
953
- """Fix common HTML syntax errors after processing while protecting scripts"""
954
-
955
- # First, extract and protect script content
956
- script_pattern = r'<script[^>]*>.*?</script>'
957
- scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
958
- script_placeholders = {}
959
-
960
- for i, script_content in enumerate(scripts):
961
- placeholder = f"<!--PROTECTED_SCRIPT_{i}-->"
962
- script_placeholders[placeholder] = script_content
963
- html_text = html_text.replace(script_content, placeholder, 1)
964
-
965
- # Also protect style tags
966
- style_pattern = r'<style[^>]*>.*?</style>'
967
- styles = re.findall(style_pattern, html_text, re.DOTALL | re.IGNORECASE)
968
- style_placeholders = {}
969
-
970
- for i, style_content in enumerate(styles):
971
- placeholder = f"<!--PROTECTED_STYLE_{i}-->"
972
- style_placeholders[placeholder] = style_content
973
- html_text = html_text.replace(style_content, placeholder, 1)
974
-
975
- # Fix DOCTYPE
976
- html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
977
-
978
- # Fix spacing issues (but not inside scripts/styles)
979
- html_text = re.sub(r'>\s+<', '><', html_text)
980
- html_text = re.sub(r'\s+>', '>', html_text)
981
- html_text = re.sub(r'<\s+', '<', html_text)
982
-
983
- # Fix common word errors that might occur during processing
984
- html_text = html_text.replace('down loaded', 'downloaded')
985
- html_text = html_text.replace('But your document', 'Your document')
986
-
987
- # Restore protected scripts
988
- for placeholder, script_content in script_placeholders.items():
989
- html_text = html_text.replace(placeholder, script_content)
990
-
991
- # Restore protected styles
992
- for placeholder, style_content in style_placeholders.items():
993
- html_text = html_text.replace(placeholder, style_content)
994
-
995
- return html_text
996
 
997
  def extract_text_from_html(self, html_content):
998
- """Extract text elements from HTML with skip logic"""
999
- soup = BeautifulSoup(html_content, 'html.parser')
1000
- text_elements = []
1001
-
1002
- # Get all text nodes using strings (the correct method)
1003
- for element in soup.strings:
1004
- # Skip if parent is script, style, or noscript
1005
- if element.parent.name in ['script', 'style', 'noscript']:
1006
- continue
1007
 
1008
- text = element.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
 
1010
- # Skip empty strings
1011
- if not text:
1012
- continue
1013
-
1014
- # Skip placeholder texts
1015
- if "SCRIPT_PLACEHOLDER" in text or "STYLE_PLACEHOLDER" in text:
1016
- continue
1017
-
1018
- # Use the existing should_skip_element logic
1019
- if not self.should_skip_element(element, text):
1020
- text_elements.append({
1021
- 'text': text,
1022
- 'element': element
1023
- })
1024
-
1025
- return soup, text_elements
1026
 
1027
  def is_likely_acronym_or_proper_noun(self, word):
1028
  """Check if a word is likely an acronym or part of a proper noun"""
 
829
  return soup, text_elements, script_placeholders
830
 
831
  def process_html(self, html_content, progress_callback=None):
832
+ """Main processing function with progress callback"""
833
+ if not html_content.strip():
834
+ return "Please provide HTML content."
835
+
836
+ # Parse the HTML first
837
+ soup_initial = BeautifulSoup(html_content, 'html.parser')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838
 
839
+ # Store ALL scripts (both external and inline) with their full content
840
+ script_storage = []
841
+ script_placeholder_template = "<!--SCRIPT_PLACEHOLDER_{}-->"
842
 
843
+ # Find and replace all script tags
844
+ for idx, script in enumerate(soup_initial.find_all('script')):
845
+ placeholder = script_placeholder_template.format(idx)
846
+ # Store the entire script tag as a string
847
+ script_storage.append(str(script))
848
+ # Replace with a comment placeholder
849
+ new_tag = soup_initial.new_string(placeholder)
850
+ script.replace_with(new_tag)
851
 
852
+ # Also store and replace style tags
853
+ style_storage = []
854
+ style_placeholder_template = "<!--STYLE_PLACEHOLDER_{}-->"
855
+
856
+ for idx, style in enumerate(soup_initial.find_all('style')):
857
+ placeholder = style_placeholder_template.format(idx)
858
+ style_storage.append(str(style))
859
+ new_tag = soup_initial.new_string(placeholder)
860
+ style.replace_with(new_tag)
861
+
862
+ # Get the modified HTML
863
+ html_content = str(soup_initial)
864
+
865
+ try:
866
+ # Extract text elements
867
+ soup, text_elements = self.extract_text_from_html(html_content)
868
 
869
+ total_elements = len(text_elements)
870
+ print(f"Found {total_elements} text elements to process (after filtering)")
 
871
 
872
+ # Process each text element
873
+ processed_count = 0
 
874
 
875
+ for i, element_info in enumerate(text_elements):
876
+ original_text = element_info['text']
877
+
878
+ # Skip placeholders
879
+ if "SCRIPT_PLACEHOLDER" in original_text or "STYLE_PLACEHOLDER" in original_text:
880
+ continue
881
+
882
+ # Skip very short texts
883
+ if len(original_text.split()) < 3:
884
+ continue
885
+
886
+ # First pass with Dipper
887
+ paraphrased_text = self.paraphrase_with_dipper(
888
+ original_text,
889
+ lex_diversity=60,
890
+ order_diversity=20
891
+ )
892
+
893
+ # Second pass with BART for longer texts (balanced probability)
894
+ if self.use_bart and len(paraphrased_text.split()) > 8:
895
+ # 30% chance to use BART for more variation (balanced)
896
+ if random.random() < 0.3:
897
+ paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
898
+
899
+ # Apply sentence variation
900
+ paraphrased_text = self.apply_sentence_variation(paraphrased_text)
901
+
902
+ # Add natural flow variations
903
+ paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
904
+
905
+ # Fix punctuation and formatting
906
+ paraphrased_text = self.fix_punctuation(paraphrased_text)
907
+
908
+ # Final quality check
909
+ if paraphrased_text and len(paraphrased_text.split()) >= 3:
910
+ element_info['element'].replace_with(NavigableString(paraphrased_text))
911
+ processed_count += 1
912
+
913
+ # Progress update
914
+ if progress_callback:
915
+ progress_callback(i + 1, total_elements)
916
+
917
+ if i % 10 == 0 or i == total_elements - 1:
918
+ progress = (i + 1) / total_elements * 100
919
+ print(f"Progress: {progress:.1f}%")
920
 
921
+ # Get the processed HTML
922
+ result = str(soup)
 
 
 
923
 
924
+ # Restore all script tags exactly as they were
925
+ for idx, script_content in enumerate(script_storage):
926
+ placeholder = script_placeholder_template.format(idx)
927
+ result = result.replace(placeholder, script_content)
928
 
929
+ # Restore all style tags exactly as they were
930
+ for idx, style_content in enumerate(style_storage):
931
+ placeholder = style_placeholder_template.format(idx)
932
+ result = result.replace(placeholder, style_content)
933
 
934
+ # Post-process the entire HTML to fix bold/strong formatting
935
+ result = self.post_process_html(result)
936
 
937
+ # Validate and fix HTML syntax (but protect scripts)
938
+ result = self.validate_and_fix_html_safe(result)
 
 
939
 
940
+ print(f"Successfully processed {processed_count} text elements")
941
+ print(f"Preserved {len(script_storage)} script tags and {len(style_storage)} style tags")
942
+
943
+ return result
944
 
945
+ except Exception as e:
946
+ import traceback
947
+ error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
948
+ print(error_msg)
949
+ # Return original HTML with error message prepended as HTML comment
950
+ return f"<!-- {error_msg} -->\n{html_content}"
951
+
952
+ def validate_and_fix_html_safe(self, html_text):
953
+ """Fix common HTML syntax errors after processing while protecting scripts"""
954
 
955
+ # First, extract and protect script content
956
+ script_pattern = r'<script[^>]*>.*?</script>'
957
+ scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
958
+ script_placeholders = {}
959
 
960
+ for i, script_content in enumerate(scripts):
961
+ placeholder = f"<!--PROTECTED_SCRIPT_{i}-->"
962
+ script_placeholders[placeholder] = script_content
963
+ html_text = html_text.replace(script_content, placeholder, 1)
964
 
965
+ # Also protect style tags
966
+ style_pattern = r'<style[^>]*>.*?</style>'
967
+ styles = re.findall(style_pattern, html_text, re.DOTALL | re.IGNORECASE)
968
+ style_placeholders = {}
969
 
970
+ for i, style_content in enumerate(styles):
971
+ placeholder = f"<!--PROTECTED_STYLE_{i}-->"
972
+ style_placeholders[placeholder] = style_content
973
+ html_text = html_text.replace(style_content, placeholder, 1)
974
 
975
+ # Fix DOCTYPE
976
+ html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
977
 
978
+ # Fix spacing issues (but not inside scripts/styles)
979
+ html_text = re.sub(r'>\s+<', '><', html_text)
980
+ html_text = re.sub(r'\s+>', '>', html_text)
981
+ html_text = re.sub(r'<\s+', '<', html_text)
982
 
983
+ # Fix common word errors that might occur during processing
984
+ html_text = html_text.replace('down loaded', 'downloaded')
985
+ html_text = html_text.replace('But your document', 'Your document')
986
 
987
+ # Restore protected scripts
988
+ for placeholder, script_content in script_placeholders.items():
989
+ html_text = html_text.replace(placeholder, script_content)
990
+
991
+ # Restore protected styles
992
+ for placeholder, style_content in style_placeholders.items():
993
+ html_text = html_text.replace(placeholder, style_content)
994
+
995
+ return html_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
996
 
997
  def extract_text_from_html(self, html_content):
998
+ """Extract text elements from HTML with skip logic"""
999
+ soup = BeautifulSoup(html_content, 'html.parser')
1000
+ text_elements = []
 
 
 
 
 
 
1001
 
1002
+ # Get all text nodes using strings (the correct method)
1003
+ for element in soup.strings:
1004
+ # Skip if parent is script, style, or noscript
1005
+ if element.parent.name in ['script', 'style', 'noscript']:
1006
+ continue
1007
+
1008
+ text = element.strip()
1009
+
1010
+ # Skip empty strings
1011
+ if not text:
1012
+ continue
1013
+
1014
+ # Skip placeholder texts
1015
+ if "SCRIPT_PLACEHOLDER" in text or "STYLE_PLACEHOLDER" in text:
1016
+ continue
1017
+
1018
+ # Use the existing should_skip_element logic
1019
+ if not self.should_skip_element(element, text):
1020
+ text_elements.append({
1021
+ 'text': text,
1022
+ 'element': element
1023
+ })
1024
 
1025
+ return soup, text_elements
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1026
 
1027
  def is_likely_acronym_or_proper_noun(self, word):
1028
  """Check if a word is likely an acronym or part of a proper noun"""