Spaces:

EdysorEdutech
/

human_final

Paused

App Files Files Community

EdysorEdutech committed on Aug 4, 2025

Commit

99d994a

verified ·

1 Parent(s): 0bc69df

Update app.py

Browse files

Files changed (1) hide show

app.py +149 -263

app.py CHANGED Viewed

@@ -666,17 +666,12 @@ class EnhancedDipperHumanizer:
         """Determine if an element should be skipped from paraphrasing"""
         if not text or len(text.strip()) < 3:
             return True
-        # Skip JavaScript code inside script tags - CRITICAL FIX
         parent = element.parent
         if parent and parent.name in ['script', 'style', 'noscript']:
             return True
-        # Also check if we're inside a script tag at any level
-        for ancestor in element.parents:
-            if ancestor.name in ['script', 'style', 'noscript']:
-                return True
         # Skip headings (h1-h6)
         if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
             return True
@@ -792,237 +787,8 @@ class EnhancedDipperHumanizer:
                     # But don't skip if it's inside a paragraph
                     if not any(p.name == 'p' for p in parent.parents):
                         return True
         return False
-    def extract_text_from_html(self, html_content):
-        """Extract text elements from HTML with skip logic"""
-        soup = BeautifulSoup(html_content, 'html.parser')
-        text_elements = []
-        # CRITICAL: Preserve all script tags completely
-        script_tags = soup.find_all('script')
-        script_placeholders = {}
-        for i, script in enumerate(script_tags):
-            placeholder = f"###SCRIPT_CONTENT_{i}###"
-            script_placeholders[placeholder] = str(script)
-            script.string = placeholder
-        # Get all text nodes
-        for element in soup.find_all(string=True):
-            # Skip script, style, and noscript content completely
-            if element.parent.name in ['script', 'style', 'noscript']:
-                continue
-            # Skip if it's a script placeholder
-            text = element.strip()
-            if text.startswith("###SCRIPT_CONTENT_") and text.endswith("###"):
-                continue
-            if text and not self.should_skip_element(element, text):
-                text_elements.append({
-                    'text': text,
-                    'element': element
-                })
-        return soup, text_elements, script_placeholders
-    def process_html(self, html_content, progress_callback=None):
-        """Main processing function with progress callback"""
-        if not html_content.strip():
-            return "Please provide HTML content."
-        # Parse the HTML first
-        soup_initial = BeautifulSoup(html_content, 'html.parser')
-        # Store ALL scripts (both external and inline) with their full content
-        script_storage = []
-        script_placeholder_template = "<!--SCRIPT_PLACEHOLDER_{}-->"
-        # Find and replace all script tags
-        for idx, script in enumerate(soup_initial.find_all('script')):
-            placeholder = script_placeholder_template.format(idx)
-            # Store the entire script tag as a string
-            script_storage.append(str(script))
-            # Replace with a comment placeholder
-            new_tag = soup_initial.new_string(placeholder)
-            script.replace_with(new_tag)
-        # Also store and replace style tags
-        style_storage = []
-        style_placeholder_template = "<!--STYLE_PLACEHOLDER_{}-->"
-        for idx, style in enumerate(soup_initial.find_all('style')):
-            placeholder = style_placeholder_template.format(idx)
-            style_storage.append(str(style))
-            new_tag = soup_initial.new_string(placeholder)
-            style.replace_with(new_tag)
-        # Get the modified HTML
-        html_content = str(soup_initial)
-        try:
-            # Extract text elements
-            soup, text_elements = self.extract_text_from_html(html_content)
-            total_elements = len(text_elements)
-            print(f"Found {total_elements} text elements to process (after filtering)")
-            # Process each text element
-            processed_count = 0
-            for i, element_info in enumerate(text_elements):
-                original_text = element_info['text']
-                # Skip placeholders
-                if "SCRIPT_PLACEHOLDER" in original_text or "STYLE_PLACEHOLDER" in original_text:
-                    continue
-                # Skip very short texts
-                if len(original_text.split()) < 3:
-                    continue
-                # First pass with Dipper
-                paraphrased_text = self.paraphrase_with_dipper(
-                    original_text,
-                    lex_diversity=60,
-                    order_diversity=20
-                )
-                # Second pass with BART for longer texts (balanced probability)
-                if self.use_bart and len(paraphrased_text.split()) > 8:
-                    # 30% chance to use BART for more variation (balanced)
-                    if random.random() < 0.3:
-                        paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
-                # Apply sentence variation
-                paraphrased_text = self.apply_sentence_variation(paraphrased_text)
-                # Add natural flow variations
-                paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
-                # Fix punctuation and formatting
-                paraphrased_text = self.fix_punctuation(paraphrased_text)
-                # Final quality check
-                if paraphrased_text and len(paraphrased_text.split()) >= 3:
-                    element_info['element'].replace_with(NavigableString(paraphrased_text))
-                    processed_count += 1
-                # Progress update
-                if progress_callback:
-                    progress_callback(i + 1, total_elements)
-                if i % 10 == 0 or i == total_elements - 1:
-                    progress = (i + 1) / total_elements * 100
-                    print(f"Progress: {progress:.1f}%")
-            # Get the processed HTML
-            result = str(soup)
-            # Restore all script tags exactly as they were
-            for idx, script_content in enumerate(script_storage):
-                placeholder = script_placeholder_template.format(idx)
-                result = result.replace(placeholder, script_content)
-            # Restore all style tags exactly as they were
-            for idx, style_content in enumerate(style_storage):
-                placeholder = style_placeholder_template.format(idx)
-                result = result.replace(placeholder, style_content)
-            # Post-process the entire HTML to fix bold/strong formatting
-            result = self.post_process_html(result)
-            # Validate and fix HTML syntax (but protect scripts)
-            result = self.validate_and_fix_html_safe(result)
-            print(f"Successfully processed {processed_count} text elements")
-            print(f"Preserved {len(script_storage)} script tags and {len(style_storage)} style tags")
-            return result
-        except Exception as e:
-            import traceback
-            error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
-            print(error_msg)
-            # Return original HTML with error message prepended as HTML comment
-            return f"<!-- {error_msg} -->\n{html_content}"
-    def validate_and_fix_html_safe(self, html_text):
-        """Fix common HTML syntax errors after processing while protecting scripts"""
-        # First, extract and protect script content
-        script_pattern = r'<script[^>]*>.*?</script>'
-        scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
-        script_placeholders = {}
-        for i, script_content in enumerate(scripts):
-            placeholder = f"<!--PROTECTED_SCRIPT_{i}-->"
-            script_placeholders[placeholder] = script_content
-            html_text = html_text.replace(script_content, placeholder, 1)
-        # Also protect style tags
-        style_pattern = r'<style[^>]*>.*?</style>'
-        styles = re.findall(style_pattern, html_text, re.DOTALL | re.IGNORECASE)
-        style_placeholders = {}
-        for i, style_content in enumerate(styles):
-            placeholder = f"<!--PROTECTED_STYLE_{i}-->"
-            style_placeholders[placeholder] = style_content
-            html_text = html_text.replace(style_content, placeholder, 1)
-        # Fix DOCTYPE
-        html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
-        # Fix spacing issues (but not inside scripts/styles)
-        html_text = re.sub(r'>\s+<', '><', html_text)
-        html_text = re.sub(r'\s+>', '>', html_text)
-        html_text = re.sub(r'<\s+', '<', html_text)
-        # Fix common word errors that might occur during processing
-        html_text = html_text.replace('down loaded', 'downloaded')
-        html_text = html_text.replace('But your document', 'Your document')
-        # Restore protected scripts
-        for placeholder, script_content in script_placeholders.items():
-            html_text = html_text.replace(placeholder, script_content)
-        # Restore protected styles
-        for placeholder, style_content in style_placeholders.items():
-            html_text = html_text.replace(placeholder, style_content)
-        return html_text
-    def extract_text_from_html(self, html_content):
-        """Extract text elements from HTML with skip logic"""
-        soup = BeautifulSoup(html_content, 'html.parser')
-        text_elements = []
-        # Get all text nodes using strings (the correct method)
-        for element in soup.strings:
-            # Skip if parent is script, style, or noscript
-            if element.parent.name in ['script', 'style', 'noscript']:
-                continue
-            text = element.strip()
-            # Skip empty strings
-            if not text:
-                continue
-            # Skip placeholder texts
-            if "SCRIPT_PLACEHOLDER" in text or "STYLE_PLACEHOLDER" in text:
-                continue
-            # Use the existing should_skip_element logic
-            if not self.should_skip_element(element, text):
-                text_elements.append({
-                    'text': text,
-                    'element': element
-                })
-        return soup, text_elements
     def is_likely_acronym_or_proper_noun(self, word):
         """Check if a word is likely an acronym or part of a proper noun"""
@@ -1561,42 +1327,41 @@ class EnhancedDipperHumanizer:
         return text
     def validate_and_fix_html(self, html_text):
         """Fix common HTML syntax errors after processing"""
-        # First, protect script content
-        script_pattern = r'<script[^>]*>(.*?)</script>'
-        scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
-        script_placeholders = {}
-        for i, script_content in enumerate(scripts):
-            placeholder = f"<!--SCRIPT_PLACEHOLDER_{i}-->"
-            script_placeholders[placeholder] = script_content
-            html_text = html_text.replace(
-                f'<script>{script_content}</script>',
-                f'<script>{placeholder}</script>',
-                1
-            )
         # Fix DOCTYPE
         html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
-        # Fix spacing issues (but not inside scripts)
-        html_text = re.sub(r'>\s+<', '><', html_text)
-        html_text = re.sub(r'\s+>', '>', html_text)
-        html_text = re.sub(r'<\s+', '<', html_text)
         # Fix common word errors that might occur during processing
         html_text = html_text.replace('down loaded', 'downloaded')
         html_text = html_text.replace('But your document', 'Your document')
-        # Restore script content
-        for placeholder, script_content in script_placeholders.items():
-            html_text = html_text.replace(
-                f'<script>{placeholder}</script>',
-                f'<script>{script_content}</script>'
-            )
         return html_text
     def add_natural_flow_variations(self, text):
@@ -1661,6 +1426,127 @@ class EnhancedDipperHumanizer:
         return ' '.join(enhanced_sentences)
     def post_process_html(self, html_text):
         """Post-process the entire HTML to fix formatting issues"""
         # Fix empty angle brackets that might appear

         """Determine if an element should be skipped from paraphrasing"""
         if not text or len(text.strip()) < 3:
             return True
+        # Skip JavaScript code inside script tags
         parent = element.parent
         if parent and parent.name in ['script', 'style', 'noscript']:
             return True
         # Skip headings (h1-h6)
         if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
             return True
                     # But don't skip if it's inside a paragraph
                     if not any(p.name == 'p' for p in parent.parents):
                         return True
         return False
     def is_likely_acronym_or_proper_noun(self, word):
         """Check if a word is likely an acronym or part of a proper noun"""
         return text
+    def extract_text_from_html(self, html_content):
+        """Extract text elements from HTML with skip logic"""
+        soup = BeautifulSoup(html_content, 'html.parser')
+        text_elements = []
+        # Get all text nodes using string instead of text (fixing deprecation)
+        for element in soup.find_all(string=True):
+            # Skip script, style, and noscript content completely
+            if element.parent.name in ['script', 'style', 'noscript']:
+                continue
+            text = element.strip()
+            if text and not self.should_skip_element(element, text):
+                text_elements.append({
+                    'text': text,
+                    'element': element
+                })
+        return soup, text_elements
     def validate_and_fix_html(self, html_text):
         """Fix common HTML syntax errors after processing

         Applies a fixed sequence of regex and literal substitutions to the
         whole ``html_text`` string and returns the result.
         """
         # Fix DOCTYPE: drop whitespace between "<!" and "DOCTYPE" (case-insensitive match)
         html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
+        # Fix spacing issues
+        # NOTE(review): these three patterns run over the entire string with no
+        # exclusions, so they also touch text content and any script/style
+        # bodies present in html_text - confirm that is intended.
+        html_text = re.sub(r'>\s+<', '><', html_text)  # Remove all whitespace between a '>' and the next '<'
+        html_text = re.sub(r'\s+>', '>', html_text)  # Remove spaces before closing >
+        html_text = re.sub(r'<\s+', '<', html_text)  # Remove spaces after opening <
         # Fix common word errors that might occur during processing
         html_text = html_text.replace('down loaded', 'downloaded')
         html_text = html_text.replace('But your document', 'Your document')
         return html_text
     def add_natural_flow_variations(self, text):
         return ' '.join(enhanced_sentences)
+    def process_html(self, html_content, progress_callback=None):
+        """Main processing function with progress callback"""
+        if not html_content.strip():
+            return "Please provide HTML content."
+        # Store all script and style content to preserve it
+        script_placeholder = "###SCRIPT_PLACEHOLDER_{}###"
+        style_placeholder = "###STYLE_PLACEHOLDER_{}###"
+        preserved_scripts = []
+        preserved_styles = []
+        # Temporarily replace script and style tags with placeholders
+        soup_temp = BeautifulSoup(html_content, 'html.parser')
+        # Preserve all script tags
+        for idx, script in enumerate(soup_temp.find_all('script')):
+            placeholder = script_placeholder.format(idx)
+            preserved_scripts.append(str(script))
+            script.replace_with(placeholder)
+        # Preserve all style tags
+        for idx, style in enumerate(soup_temp.find_all('style')):
+            placeholder = style_placeholder.format(idx)
+            preserved_styles.append(str(style))
+            style.replace_with(placeholder)
+        # Get the modified HTML
+        html_content = str(soup_temp)
+        try:
+            # Extract text elements
+            soup, text_elements = self.extract_text_from_html(html_content)
+            total_elements = len(text_elements)
+            print(f"Found {total_elements} text elements to process (after filtering)")
+            # Process each text element
+            processed_count = 0
+            for i, element_info in enumerate(text_elements):
+                original_text = element_info['text']
+                # Skip placeholders
+                if "###SCRIPT_PLACEHOLDER_" in original_text or "###STYLE_PLACEHOLDER_" in original_text:
+                    continue
+                # Skip very short texts
+                if len(original_text.split()) < 3:
+                    continue
+                # First pass with Dipper
+                paraphrased_text = self.paraphrase_with_dipper(
+                    original_text,
+                    lex_diversity=60,
+                    order_diversity=20
+                )
+                # Second pass with BART for longer texts (balanced probability)
+                if self.use_bart and len(paraphrased_text.split()) > 8:
+                    # 30% chance to use BART for more variation (balanced)
+                    if random.random() < 0.3:
+                        paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
+                # Apply sentence variation
+                paraphrased_text = self.apply_sentence_variation(paraphrased_text)
+                # Add natural flow variations
+                paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
+                # Fix punctuation and formatting
+                paraphrased_text = self.fix_punctuation(paraphrased_text)
+                # Final quality check
+                if paraphrased_text and len(paraphrased_text.split()) >= 3:
+                    element_info['element'].replace_with(NavigableString(paraphrased_text))
+                    processed_count += 1
+                # Progress update
+                if progress_callback:
+                    progress_callback(i + 1, total_elements)
+                if i % 10 == 0 or i == total_elements - 1:
+                    progress = (i + 1) / total_elements * 100
+                    print(f"Progress: {progress:.1f}%")
+            # Get the processed HTML
+            result = str(soup)
+            # Restore all script tags
+            for idx, script_content in enumerate(preserved_scripts):
+                placeholder = script_placeholder.format(idx)
+                result = result.replace(placeholder, script_content)
+            # Restore all style tags
+            for idx, style_content in enumerate(preserved_styles):
+                placeholder = style_placeholder.format(idx)
+                result = result.replace(placeholder, style_content)
+            # Post-process the entire HTML to fix bold/strong formatting
+            result = self.post_process_html(result)
+            # Validate and fix HTML syntax
+            result = self.validate_and_fix_html(result)
+            # Count skipped elements properly
+            all_text_elements = soup.find_all(string=True)
+            skipped = len([e for e in all_text_elements if e.strip() and e.parent.name not in ['script', 'style', 'noscript']]) - total_elements
+            print(f"Successfully processed {processed_count} text elements")
+            print(f"Skipped {skipped} elements (headings, CTAs, tables, testimonials, strong/bold tags, etc.)")
+            print(f"Preserved {len(preserved_scripts)} script tags and {len(preserved_styles)} style tags")
+            return result
+        except Exception as e:
+            import traceback
+            error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
+            print(error_msg)
+            # Return original HTML with error message prepended as HTML comment
+            return f"<!-- {error_msg} -->\n{html_content}"
     def post_process_html(self, html_text):
         """Post-process the entire HTML to fix formatting issues"""
         # Fix empty angle brackets that might appear