Spaces:

EdysorEdutech
/

human_final

Paused

App Files Community

EdysorEdutech committed on Aug 4, 2025

Commit

0bc69df

verified ·

1 Parent(s): afd7422

Update app.py

Browse files

Files changed (1) hide show

app.py +169 -169

app.py CHANGED Viewed

@@ -829,200 +829,200 @@ class EnhancedDipperHumanizer:
         return soup, text_elements, script_placeholders
     def process_html(self, html_content, progress_callback=None):
-    """Main processing function with progress callback"""
-    if not html_content.strip():
-        return "Please provide HTML content."
-    # Parse the HTML first
-    soup_initial = BeautifulSoup(html_content, 'html.parser')
-    # Store ALL scripts (both external and inline) with their full content
-    script_storage = []
-    script_placeholder_template = "<!--SCRIPT_PLACEHOLDER_{}-->"
-    # Find and replace all script tags
-    for idx, script in enumerate(soup_initial.find_all('script')):
-        placeholder = script_placeholder_template.format(idx)
-        # Store the entire script tag as a string
-        script_storage.append(str(script))
-        # Replace with a comment placeholder
-        new_tag = soup_initial.new_string(placeholder)
-        script.replace_with(new_tag)
-    # Also store and replace style tags
-    style_storage = []
-    style_placeholder_template = "<!--STYLE_PLACEHOLDER_{}-->"
-    for idx, style in enumerate(soup_initial.find_all('style')):
-        placeholder = style_placeholder_template.format(idx)
-        style_storage.append(str(style))
-        new_tag = soup_initial.new_string(placeholder)
-        style.replace_with(new_tag)
-    # Get the modified HTML
-    html_content = str(soup_initial)
-    try:
-        # Extract text elements
-        soup, text_elements = self.extract_text_from_html(html_content)
-        total_elements = len(text_elements)
-        print(f"Found {total_elements} text elements to process (after filtering)")
-        # Process each text element
-        processed_count = 0
-        for i, element_info in enumerate(text_elements):
-            original_text = element_info['text']
-            # Skip placeholders
-            if "SCRIPT_PLACEHOLDER" in original_text or "STYLE_PLACEHOLDER" in original_text:
-                continue
-            # Skip very short texts
-            if len(original_text.split()) < 3:
-                continue
-            # First pass with Dipper
-            paraphrased_text = self.paraphrase_with_dipper(
-                original_text,
-                lex_diversity=60,
-                order_diversity=20
-            )
-            # Second pass with BART for longer texts (balanced probability)
-            if self.use_bart and len(paraphrased_text.split()) > 8:
-                # 30% chance to use BART for more variation (balanced)
-                if random.random() < 0.3:
-                    paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
-            # Apply sentence variation
-            paraphrased_text = self.apply_sentence_variation(paraphrased_text)
-            # Add natural flow variations
-            paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
-            # Fix punctuation and formatting
-            paraphrased_text = self.fix_punctuation(paraphrased_text)
-            # Final quality check
-            if paraphrased_text and len(paraphrased_text.split()) >= 3:
-                element_info['element'].replace_with(NavigableString(paraphrased_text))
-                processed_count += 1
-            # Progress update
-            if progress_callback:
-                progress_callback(i + 1, total_elements)
-            if i % 10 == 0 or i == total_elements - 1:
-                progress = (i + 1) / total_elements * 100
-                print(f"Progress: {progress:.1f}%")
-        # Get the processed HTML
-        result = str(soup)
-        # Restore all script tags exactly as they were
-        for idx, script_content in enumerate(script_storage):
-            placeholder = script_placeholder_template.format(idx)
-            result = result.replace(placeholder, script_content)
-        # Restore all style tags exactly as they were
-        for idx, style_content in enumerate(style_storage):
-            placeholder = style_placeholder_template.format(idx)
-            result = result.replace(placeholder, style_content)
-        # Post-process the entire HTML to fix bold/strong formatting
-        result = self.post_process_html(result)
-        # Validate and fix HTML syntax (but protect scripts)
-        result = self.validate_and_fix_html_safe(result)
-        print(f"Successfully processed {processed_count} text elements")
-        print(f"Preserved {len(script_storage)} script tags and {len(style_storage)} style tags")
-        return result
-    except Exception as e:
-        import traceback
-        error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
-        print(error_msg)
-        # Return original HTML with error message prepended as HTML comment
-        return f"<!-- {error_msg} -->\n{html_content}"
-def validate_and_fix_html_safe(self, html_text):
-    """Fix common HTML syntax errors after processing while protecting scripts"""
-    # First, extract and protect script content
-    script_pattern = r'<script[^>]*>.*?</script>'
-    scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
-    script_placeholders = {}
-    for i, script_content in enumerate(scripts):
-        placeholder = f"<!--PROTECTED_SCRIPT_{i}-->"
-        script_placeholders[placeholder] = script_content
-        html_text = html_text.replace(script_content, placeholder, 1)
-    # Also protect style tags
-    style_pattern = r'<style[^>]*>.*?</style>'
-    styles = re.findall(style_pattern, html_text, re.DOTALL | re.IGNORECASE)
-    style_placeholders = {}
-    for i, style_content in enumerate(styles):
-        placeholder = f"<!--PROTECTED_STYLE_{i}-->"
-        style_placeholders[placeholder] = style_content
-        html_text = html_text.replace(style_content, placeholder, 1)
-    # Fix DOCTYPE
-    html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
-    # Fix spacing issues (but not inside scripts/styles)
-    html_text = re.sub(r'>\s+<', '><', html_text)
-    html_text = re.sub(r'\s+>', '>', html_text)
-    html_text = re.sub(r'<\s+', '<', html_text)
-    # Fix common word errors that might occur during processing
-    html_text = html_text.replace('down loaded', 'downloaded')
-    html_text = html_text.replace('But your document', 'Your document')
-    # Restore protected scripts
-    for placeholder, script_content in script_placeholders.items():
-        html_text = html_text.replace(placeholder, script_content)
-    # Restore protected styles
-    for placeholder, style_content in style_placeholders.items():
-        html_text = html_text.replace(placeholder, style_content)
-    return html_text
     def extract_text_from_html(self, html_content):
-    """Extract text elements from HTML with skip logic"""
-    soup = BeautifulSoup(html_content, 'html.parser')
-    text_elements = []
-    # Get all text nodes using strings (the correct method)
-    for element in soup.strings:
-        # Skip if parent is script, style, or noscript
-        if element.parent.name in ['script', 'style', 'noscript']:
-            continue
-        text = element.strip()
-        # Skip empty strings
-        if not text:
-            continue
-        # Skip placeholder texts
-        if "SCRIPT_PLACEHOLDER" in text or "STYLE_PLACEHOLDER" in text:
-            continue
-        # Use the existing should_skip_element logic
-        if not self.should_skip_element(element, text):
-            text_elements.append({
-                'text': text,
-                'element': element
-            })
-    return soup, text_elements
     def is_likely_acronym_or_proper_noun(self, word):
         """Check if a word is likely an acronym or part of a proper noun"""

         return soup, text_elements, script_placeholders
     def process_html(self, html_content, progress_callback=None):
+        """Main processing function with progress callback"""
+        if not html_content.strip():
+            return "Please provide HTML content."
+        # Parse the HTML first
+        soup_initial = BeautifulSoup(html_content, 'html.parser')
+        # Store ALL scripts (both external and inline) with their full content
+        script_storage = []
+        script_placeholder_template = "<!--SCRIPT_PLACEHOLDER_{}-->"
+        # Find and replace all script tags
+        for idx, script in enumerate(soup_initial.find_all('script')):
+            placeholder = script_placeholder_template.format(idx)
+            # Store the entire script tag as a string
+            script_storage.append(str(script))
+            # Replace with a comment placeholder
+            new_tag = soup_initial.new_string(placeholder)
+            script.replace_with(new_tag)
+        # Also store and replace style tags
+        style_storage = []
+        style_placeholder_template = "<!--STYLE_PLACEHOLDER_{}-->"
+        for idx, style in enumerate(soup_initial.find_all('style')):
+            placeholder = style_placeholder_template.format(idx)
+            style_storage.append(str(style))
+            new_tag = soup_initial.new_string(placeholder)
+            style.replace_with(new_tag)
+        # Get the modified HTML
+        html_content = str(soup_initial)
+        try:
+            # Extract text elements
+            soup, text_elements = self.extract_text_from_html(html_content)
+            total_elements = len(text_elements)
+            print(f"Found {total_elements} text elements to process (after filtering)")
+            # Process each text element
+            processed_count = 0
+            for i, element_info in enumerate(text_elements):
+                original_text = element_info['text']
+                # Skip placeholders
+                if "SCRIPT_PLACEHOLDER" in original_text or "STYLE_PLACEHOLDER" in original_text:
+                    continue
+                # Skip very short texts
+                if len(original_text.split()) < 3:
+                    continue
+                # First pass with Dipper
+                paraphrased_text = self.paraphrase_with_dipper(
+                    original_text,
+                    lex_diversity=60,
+                    order_diversity=20
+                )
+                # Second pass with BART for longer texts (balanced probability)
+                if self.use_bart and len(paraphrased_text.split()) > 8:
+                    # 30% chance to use BART for more variation (balanced)
+                    if random.random() < 0.3:
+                        paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
+                # Apply sentence variation
+                paraphrased_text = self.apply_sentence_variation(paraphrased_text)
+                # Add natural flow variations
+                paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
+                # Fix punctuation and formatting
+                paraphrased_text = self.fix_punctuation(paraphrased_text)
+                # Final quality check
+                if paraphrased_text and len(paraphrased_text.split()) >= 3:
+                    element_info['element'].replace_with(NavigableString(paraphrased_text))
+                    processed_count += 1
+                # Progress update
+                if progress_callback:
+                    progress_callback(i + 1, total_elements)
+                if i % 10 == 0 or i == total_elements - 1:
+                    progress = (i + 1) / total_elements * 100
+                    print(f"Progress: {progress:.1f}%")
+            # Get the processed HTML
+            result = str(soup)
+            # Restore all script tags exactly as they were
+            for idx, script_content in enumerate(script_storage):
+                placeholder = script_placeholder_template.format(idx)
+                result = result.replace(placeholder, script_content)
+            # Restore all style tags exactly as they were
+            for idx, style_content in enumerate(style_storage):
+                placeholder = style_placeholder_template.format(idx)
+                result = result.replace(placeholder, style_content)
+            # Post-process the entire HTML to fix bold/strong formatting
+            result = self.post_process_html(result)
+            # Validate and fix HTML syntax (but protect scripts)
+            result = self.validate_and_fix_html_safe(result)
+            print(f"Successfully processed {processed_count} text elements")
+            print(f"Preserved {len(script_storage)} script tags and {len(style_storage)} style tags")
+            return result
+        except Exception as e:
+            import traceback
+            error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
+            print(error_msg)
+            # Return original HTML with error message prepended as HTML comment
+            return f"<!-- {error_msg} -->\n{html_content}"
+    def validate_and_fix_html_safe(self, html_text):
+        """Fix common HTML syntax errors after processing while protecting scripts"""
+        # First, extract and protect script content
+        script_pattern = r'<script[^>]*>.*?</script>'
+        scripts = re.findall(script_pattern, html_text, re.DOTALL | re.IGNORECASE)
+        script_placeholders = {}
+        for i, script_content in enumerate(scripts):
+            placeholder = f"<!--PROTECTED_SCRIPT_{i}-->"
+            script_placeholders[placeholder] = script_content
+            html_text = html_text.replace(script_content, placeholder, 1)
+        # Also protect style tags
+        style_pattern = r'<style[^>]*>.*?</style>'
+        styles = re.findall(style_pattern, html_text, re.DOTALL | re.IGNORECASE)
+        style_placeholders = {}
+        for i, style_content in enumerate(styles):
+            placeholder = f"<!--PROTECTED_STYLE_{i}-->"
+            style_placeholders[placeholder] = style_content
+            html_text = html_text.replace(style_content, placeholder, 1)
+        # Fix DOCTYPE
+        html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
+        # Fix spacing issues (but not inside scripts/styles)
+        html_text = re.sub(r'>\s+<', '><', html_text)
+        html_text = re.sub(r'\s+>', '>', html_text)
+        html_text = re.sub(r'<\s+', '<', html_text)
+        # Fix common word errors that might occur during processing
+        html_text = html_text.replace('down loaded', 'downloaded')
+        html_text = html_text.replace('But your document', 'Your document')
+        # Restore protected scripts
+        for placeholder, script_content in script_placeholders.items():
+            html_text = html_text.replace(placeholder, script_content)
+        # Restore protected styles
+        for placeholder, style_content in style_placeholders.items():
+            html_text = html_text.replace(placeholder, style_content)
+        return html_text
     def extract_text_from_html(self, html_content):
+        """Extract text elements from HTML with skip logic"""
+        soup = BeautifulSoup(html_content, 'html.parser')
+        text_elements = []
+        # Get all text nodes using strings (the correct method)
+        for element in soup.strings:
+            # Skip if parent is script, style, or noscript
+            if element.parent.name in ['script', 'style', 'noscript']:
+                continue
+            text = element.strip()
+            # Skip empty strings
+            if not text:
+                continue
+            # Skip placeholder texts
+            if "SCRIPT_PLACEHOLDER" in text or "STYLE_PLACEHOLDER" in text:
+                continue
+            # Use the existing should_skip_element logic
+            if not self.should_skip_element(element, text):
+                text_elements.append({
+                    'text': text,
+                    'element': element
+                })
+        return soup, text_elements
     def is_likely_acronym_or_proper_noun(self, word):
         """Check if a word is likely an acronym or part of a proper noun"""