EdysorEdutech committed on
Commit
afd7422
·
verified ·
1 Parent(s): dcfc371

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -56
app.py CHANGED
@@ -829,71 +829,200 @@ class EnhancedDipperHumanizer:
829
  return soup, text_elements, script_placeholders
830
 
831
  def process_html(self, html_content, progress_callback=None):
832
- """Main processing function with progress callback"""
833
- if not html_content.strip():
834
- return "Please provide HTML content."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
835
 
836
- try:
837
- # Extract text elements with script preservation
838
- soup, text_elements, script_placeholders = self.extract_text_from_html(html_content)
 
 
 
 
 
839
 
840
- total_elements = len(text_elements)
841
- print(f"Found {total_elements} text elements to process (after filtering)")
 
842
 
843
- # Process each text element
844
- processed_count = 0
 
845
 
846
- for i, element_info in enumerate(text_elements):
847
- original_text = element_info['text']
848
-
849
- # Skip placeholders
850
- if "###SCRIPT_" in original_text:
851
- continue
852
-
853
- # Skip very short texts
854
- if len(original_text.split()) < 3:
855
- continue
856
-
857
- # Process the text with your existing logic
858
- paraphrased_text = self.paraphrase_with_dipper(
859
- original_text,
860
- lex_diversity=60,
861
- order_diversity=20
862
- )
863
-
864
- # Apply other transformations...
865
- paraphrased_text = self.apply_sentence_variation(paraphrased_text)
866
- paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
867
- paraphrased_text = self.fix_punctuation(paraphrased_text)
868
-
869
- # Final quality check
870
- if paraphrased_text and len(paraphrased_text.split()) >= 3:
871
- element_info['element'].replace_with(NavigableString(paraphrased_text))
872
- processed_count += 1
873
-
874
- # Progress update
875
- if progress_callback:
876
- progress_callback(i + 1, total_elements)
877
 
878
- # Get the processed HTML
879
- result_html = str(soup)
 
 
 
880
 
881
- # CRITICAL: Restore all script content exactly as it was
882
- for placeholder, original_script in script_placeholders.items():
883
- result_html = result_html.replace(f"<script>{placeholder}</script>", original_script)
884
 
885
- # Post-process the entire HTML
886
- result_html = self.post_process_html(result_html)
887
- result_html = self.validate_and_fix_html(result_html)
888
 
889
- print(f"Successfully processed {processed_count} text elements")
890
- return result_html
891
 
892
- except Exception as e:
893
- import traceback
894
- error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
895
- print(error_msg)
896
- return f"<!-- {error_msg} -->\n{html_content}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
897
 
898
  def is_likely_acronym_or_proper_noun(self, word):
899
  """Check if a word is likely an acronym or part of a proper noun"""
 
829
  return soup, text_elements, script_placeholders
830
 
831
  def process_html(self, html_content, progress_callback=None):
832
+ """Main processing function with progress callback"""
833
+ if not html_content.strip():
834
+ return "Please provide HTML content."
835
+
836
+ # Parse the HTML first
837
+ soup_initial = BeautifulSoup(html_content, 'html.parser')
838
+
839
+ # Store ALL scripts (both external and inline) with their full content
840
+ script_storage = []
841
+ script_placeholder_template = "<!--SCRIPT_PLACEHOLDER_{}-->"
842
+
843
+ # Find and replace all script tags
844
+ for idx, script in enumerate(soup_initial.find_all('script')):
845
+ placeholder = script_placeholder_template.format(idx)
846
+ # Store the entire script tag as a string
847
+ script_storage.append(str(script))
848
+ # Replace with a comment placeholder
849
+ new_tag = soup_initial.new_string(placeholder)
850
+ script.replace_with(new_tag)
851
+
852
+ # Also store and replace style tags
853
+ style_storage = []
854
+ style_placeholder_template = "<!--STYLE_PLACEHOLDER_{}-->"
855
+
856
+ for idx, style in enumerate(soup_initial.find_all('style')):
857
+ placeholder = style_placeholder_template.format(idx)
858
+ style_storage.append(str(style))
859
+ new_tag = soup_initial.new_string(placeholder)
860
+ style.replace_with(new_tag)
861
+
862
+ # Get the modified HTML
863
+ html_content = str(soup_initial)
864
+
865
+ try:
866
+ # Extract text elements
867
+ soup, text_elements = self.extract_text_from_html(html_content)
868
 
869
+ total_elements = len(text_elements)
870
+ print(f"Found {total_elements} text elements to process (after filtering)")
871
+
872
+ # Process each text element
873
+ processed_count = 0
874
+
875
+ for i, element_info in enumerate(text_elements):
876
+ original_text = element_info['text']
877
 
878
+ # Skip placeholders
879
+ if "SCRIPT_PLACEHOLDER" in original_text or "STYLE_PLACEHOLDER" in original_text:
880
+ continue
881
 
882
+ # Skip very short texts
883
+ if len(original_text.split()) < 3:
884
+ continue
885
 
886
+ # First pass with Dipper
887
+ paraphrased_text = self.paraphrase_with_dipper(
888
+ original_text,
889
+ lex_diversity=60,
890
+ order_diversity=20
891
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
892
 
893
+ # Second pass with BART for longer texts (balanced probability)
894
+ if self.use_bart and len(paraphrased_text.split()) > 8:
895
+ # 30% chance to use BART for more variation (balanced)
896
+ if random.random() < 0.3:
897
+ paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
898
 
899
+ # Apply sentence variation
900
+ paraphrased_text = self.apply_sentence_variation(paraphrased_text)
 
901
 
902
+ # Add natural flow variations
903
+ paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
 
904
 
905
+ # Fix punctuation and formatting
906
+ paraphrased_text = self.fix_punctuation(paraphrased_text)
907
 
908
+ # Final quality check
909
+ if paraphrased_text and len(paraphrased_text.split()) >= 3:
910
+ element_info['element'].replace_with(NavigableString(paraphrased_text))
911
+ processed_count += 1
912
+
913
+ # Progress update
914
+ if progress_callback:
915
+ progress_callback(i + 1, total_elements)
916
+
917
+ if i % 10 == 0 or i == total_elements - 1:
918
+ progress = (i + 1) / total_elements * 100
919
+ print(f"Progress: {progress:.1f}%")
920
+
921
+ # Get the processed HTML
922
+ result = str(soup)
923
+
924
+ # Restore all script tags exactly as they were
925
+ for idx, script_content in enumerate(script_storage):
926
+ placeholder = script_placeholder_template.format(idx)
927
+ result = result.replace(placeholder, script_content)
928
+
929
+ # Restore all style tags exactly as they were
930
+ for idx, style_content in enumerate(style_storage):
931
+ placeholder = style_placeholder_template.format(idx)
932
+ result = result.replace(placeholder, style_content)
933
+
934
+ # Post-process the entire HTML to fix bold/strong formatting
935
+ result = self.post_process_html(result)
936
+
937
+ # Validate and fix HTML syntax (but protect scripts)
938
+ result = self.validate_and_fix_html_safe(result)
939
+
940
+ print(f"Successfully processed {processed_count} text elements")
941
+ print(f"Preserved {len(script_storage)} script tags and {len(style_storage)} style tags")
942
+
943
+ return result
944
+
945
+ except Exception as e:
946
+ import traceback
947
+ error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
948
+ print(error_msg)
949
+ # Return original HTML with error message prepended as HTML comment
950
+ return f"<!-- {error_msg} -->\n{html_content}"
951
+
952
+ def validate_and_fix_html_safe(self, html_text):
953
+ """Fix common HTML syntax errors after processing while protecting scripts"""
954
+
955
+ # First, extract and protect script content
956
+ script_pattern = r'<script[^>]*>.*?</script>'
957
+ scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
958
+ script_placeholders = {}
959
+
960
+ for i, script_content in enumerate(scripts):
961
+ placeholder = f"<!--PROTECTED_SCRIPT_{i}-->"
962
+ script_placeholders[placeholder] = script_content
963
+ html_text = html_text.replace(script_content, placeholder, 1)
964
+
965
+ # Also protect style tags
966
+ style_pattern = r'<style[^>]*>.*?</style>'
967
+ styles = re.findall(style_pattern, html_text, re.DOTALL | re.IGNORECASE)
968
+ style_placeholders = {}
969
+
970
+ for i, style_content in enumerate(styles):
971
+ placeholder = f"<!--PROTECTED_STYLE_{i}-->"
972
+ style_placeholders[placeholder] = style_content
973
+ html_text = html_text.replace(style_content, placeholder, 1)
974
+
975
+ # Fix DOCTYPE
976
+ html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
977
+
978
+ # Fix spacing issues (but not inside scripts/styles)
979
+ html_text = re.sub(r'>\s+<', '><', html_text)
980
+ html_text = re.sub(r'\s+>', '>', html_text)
981
+ html_text = re.sub(r'<\s+', '<', html_text)
982
+
983
+ # Fix common word errors that might occur during processing
984
+ html_text = html_text.replace('down loaded', 'downloaded')
985
+ html_text = html_text.replace('But your document', 'Your document')
986
+
987
+ # Restore protected scripts
988
+ for placeholder, script_content in script_placeholders.items():
989
+ html_text = html_text.replace(placeholder, script_content)
990
+
991
+ # Restore protected styles
992
+ for placeholder, style_content in style_placeholders.items():
993
+ html_text = html_text.replace(placeholder, style_content)
994
+
995
+ return html_text
996
+
997
+ def extract_text_from_html(self, html_content):
998
+ """Extract text elements from HTML with skip logic"""
999
+ soup = BeautifulSoup(html_content, 'html.parser')
1000
+ text_elements = []
1001
+
1002
+ # Get all text nodes using strings (the correct method)
1003
+ for element in soup.strings:
1004
+ # Skip if parent is script, style, or noscript
1005
+ if element.parent.name in ['script', 'style', 'noscript']:
1006
+ continue
1007
+
1008
+ text = element.strip()
1009
+
1010
+ # Skip empty strings
1011
+ if not text:
1012
+ continue
1013
+
1014
+ # Skip placeholder texts
1015
+ if "SCRIPT_PLACEHOLDER" in text or "STYLE_PLACEHOLDER" in text:
1016
+ continue
1017
+
1018
+ # Use the existing should_skip_element logic
1019
+ if not self.should_skip_element(element, text):
1020
+ text_elements.append({
1021
+ 'text': text,
1022
+ 'element': element
1023
+ })
1024
+
1025
+ return soup, text_elements
1026
 
1027
  def is_likely_acronym_or_proper_noun(self, word):
1028
  """Check if a word is likely an acronym or part of a proper noun"""